# Start from an empty workspace (hidden objects included), then run the
# garbage collector; at top level its memory table is printed.
rm(list = ls(all.names = TRUE))
gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  555223 29.7    1243122 66.4   686457 36.7
## Vcells 1019470  7.8    8388608 64.0  1876634 14.4
library(magrittr)
library(data.table)
library(knitr)

# Two spellings of "not in": TRUE where `x` has no match in `table`.
`%!in%` <- function(x, table) !(x %in% table)
`%nin%` <- function(x, table) !(x %in% table)

# Work relative to the folder of the document that is active in RStudio.
setwd(dirname(rstudioapi::getActiveDocumentContext()[["path"]]))

1 Ath gmm

  • lowercase, protein based IDs!
# Arabidopsis Mercator annotation -> `ath.gmm`: one row per IDENTIFIER as read,
# with all distinct BINCODE / NAME / DESCRIPTION values joined by " | ".
# fp = file.path('..', 'input', 'ath-annot')
fp <- file.path('..', 'input', 'Mercator')
# fn = 'ath_Araport11_2018-05-25_mapping.txt.gz'
# fn = 'X4.4_Arabidopsis_thaliana.txt'
fn <- 'ath_Mercator4v7_results.txt'
gmm <- data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
# Drop rows whose identifier is nothing but a pair of single quotes.
gmm <- gmm[gmm$IDENTIFIER != "''", ]

combined <- gmm[, .(
  BINCODE = paste(unique(BINCODE), collapse = " | "),
  NAME = paste(unique(NAME), collapse = " | "),
  DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]

# Strip every single quote. sub() would remove only the first one, which
# leaves quotes behind in collapsed cells such as "'a' | 'b'".
strip_quotes <- function(x) gsub("'", "", x, fixed = TRUE)

# Identifier: upper-case, cut at the first dot, then remove quotes.
# NOTE(review): grouping above happens before the suffix is cut, so IDs that
# differ only after the first dot stay on separate rows with the same
# IDENTIFIER -- confirm that this is intended.
combined$IDENTIFIER <- strip_quotes(sub("\\..*$", "", toupper(combined$IDENTIFIER)))
combined$BINCODE <- strip_quotes(combined$BINCODE)
combined$NAME <- strip_quotes(combined$NAME)
combined$DESCRIPTION <- strip_quotes(combined$DESCRIPTION)
colnames(combined)[2:4] <- paste('ath', colnames(combined)[2:4], sep = '_')

ath.gmm <- combined

2 Ath SKM & annotation

note: some duplicated ids in PSS

# Gene names (`gn`): first column is the ID cut at its first dot, second
# column is renamed to athName; exact duplicate rows are removed.
fp <- file.path(
  '..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12',
  'early_release', 'Athaliana_447_Araport11', 'annotation'
)
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn <- 'Athaliana_447_Araport11.geneName.txt'
gn <- data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
data.table::setnames(gn, 2, 'athName')
gn[, V1 := sub('\\..*', '', V1)]
gn <- unique(gn)


# Synonyms (`sn`): every non-missing, non-empty cell from column 2 onwards is
# joined per row with " | "; only the ID and the joined text are kept.
fn <- 'Athaliana_447_Araport11.synonym.txt'
sn <- data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(cells) {
  paste(cells[!(is.na(cells) | cells == "")], collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Keep the first column and the merged column (the last one) only.
sn <- sn[, c(1, ncol(sn)), with = FALSE]
data.table::setnames(sn, 2, 'athSynonims')
sn[, V1 := sub('\\..*', '', V1)]
sn <- unique(sn)


# SKM reaction nodes -> `pss_long`: one row per (all_pathways, short_name, id),
# where the ids are taken from the bracketed list inside the node name.
fp <- file.path('..', 'input', 'SKM_2025-07-08')
fn <- 'rxn-nodes-public.tsv'
pss <- data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind <- grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss <- pss[, ..ind]
# Keep only nodes whose name contains an opening bracket.
ind <- grep('\\[', pss$name)
pss <- pss[ind, ]

# Text between the first "[" and the following "]", split on commas.
# Each piece is trimmed so that "a, b" does not produce " b", which the
# later "AT" prefix filter would silently discard.
pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := lapply(strsplit(ids_string, split = ","), trimws)]
max_ids <- max(lengths(pss$ids_list))
# Spread the lists over id_1 ... id_<max_ids>; shorter lists are padded
# with NA. set() adds the columns by reference and vapply() guarantees a
# character column whatever the input looks like.
for (i in seq_len(max_ids)) {
  data.table::set(
    pss,
    j = paste0("id_", i),
    value = vapply(
      pss$ids_list,
      function(x) if (length(x) >= i) x[i] else NA_character_,
      character(1)
    )
  )
}
pss[, c("ids_string", "ids_list") := NULL]

pss_long <- melt(
  pss,
  id.vars = c("name", "all_pathways", 'short_name'),       # Columns to keep as is
  measure.vars = patterns("^id_"),           # Columns to melt (all starting with "id_")
  variable.name = "id_num",                   # Name for the melted variable column
  value.name = "id"                           # Name for the melted value column
)

# Drop padding rows and helper columns, cut ids at their first dot and
# remove exact duplicate rows.
pss_long <- pss_long[!is.na(id) & id != ""]
pss_long[, c("id_num", "name") := NULL]
pss_long$id <- sub('\\..*', '', pss_long$id)
pss_long <- pss_long[!duplicated(pss_long), ]
# How many ids still occur on more than one row?
table(duplicated(pss_long$id))
## 
## FALSE  TRUE 
##   816    24
# Show every row whose id occurs more than once and starts with "AT",
# sorted by id.
print(
  dplyr::arrange(
    dplyr::filter(
      pss_long,
      id %in% id[duplicated(id)],
      stringr::str_starts(id, "^AT")
    ),
    id
  )
)
##                                              all_pathways short_name        id
##                                                    <char>     <char>    <char>
##  1:                               Hormone - Ethylene (ET)      ORA59 AT1G06160
##  2:                               Hormone - Ethylene (ET)  ERF/ORA59 AT1G06160
##  3:                         Hormone - Salicylic acid (SA)       TCP8 AT1G58100
##  4:                         Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
##  5:                               Hormone - Ethylene (ET)       EDF2 AT1G68840
##  6:                               Hormone - Ethylene (ET)    ERF/EDF AT1G68840
##  7: Signalling - Heat-shock proteins (HSPs),Stress - Heat      HSP70 AT3G12580
##  8:               Signalling - Heat-shock proteins (HSPs)        HSP AT3G12580
##  9:                               Hormone - Ethylene (ET)       ERF1 AT3G23240
## 10:                               Hormone - Ethylene (ET)    ERF/EDF AT3G23240
## 11:                               Hormone - Ethylene (ET)       ERF6 AT4G17490
## 12:                               Hormone - Ethylene (ET)  ERF/ORA59 AT4G17490
## 13:                               Hormone - Ethylene (ET)       ERF1 AT4G17500
## 14:                               Hormone - Ethylene (ET)    ERF/EDF AT4G17500
## 15:               Signalling - Heat-shock proteins (HSPs)     MED37E AT5G02500
## 16:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G02500
## 17:                               Hormone - Ethylene (ET)     ERF096 AT5G43410
## 18:                               Hormone - Ethylene (ET)    ERF/EDF AT5G43410
## 19:                               Hormone - Ethylene (ET)       ERF5 AT5G47230
## 20:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G47230
## 21:                               Hormone - Ethylene (ET)     ERF105 AT5G51190
## 22:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G51190
## 23:               Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G59720
## 25:                               Hormone - Ethylene (ET)     ERF104 AT5G61600
## 26:                               Hormone - Ethylene (ET)    ERF/EDF AT5G61600
##                                              all_pathways short_name        id
# One row per "AT..." id: within each id the distinct non-missing values of
# every other column are joined with " | " (NA when there are none).
pss_long <- dplyr::summarise(
  dplyr::group_by(
    dplyr::filter(pss_long, stringr::str_starts(id, "AT")),
    id
  ),
  dplyr::across(
    .cols = dplyr::everything(),
    .fns = function(column) {
      distinct_vals <- unique(na.omit(column))
      if (length(distinct_vals) == 0) {
        NA_character_
      } else if (length(distinct_vals) == 1) {
        distinct_vals
      } else {
        paste(distinct_vals, collapse = " | ")
      }
    }
  ),
  .groups = "drop"
)

Note: be careful with 35.2 bin matches

3 Abbreviations

Plant Name Label JCVI-MCScan Compara Plants Plaza OrthoDB FastOma RBH Mercator
Malus domestica apple mdo_GDv1 malus_domestica_golden mdo mdo mdo mdo mdo
mdo_HChap1
Prunus persica ppe ppe prunus_persica ppe ppe pper ppe pper
Prunus dulcis / P. amygdalus almond almond prunus_dulcis pdul pdul pdul pdul
Prunus avium wild cherry wildcherry prunus_avium pavi pavi pavi pavi
Prunus armeniaca apricot apricot parm parm parm parm
Prunus cerasifera cherry plum cherryplum pcer pcer pcer
Pyrus pear pear pcox pcox pcox
Prunus sibirica Siberian apricot siberianapricot psib psib psib
# in OrthoDB and PLAZA

# mdo_HChap1/mdo/mdo
# ppe/ppe/pper


# in OrthoDB, no PLAZA
# pdul/pdul/pdul
# wildcherry/pavi/pavi
# apricot/parm/parm



# not in OrthoDB or PLAZA
# pear/pcox/pcox
# cherryplum/pcer/pcer
# siberianapricot/psib/psib
# Settings for one plant; they are copied into `env` below so that code
# evaluated in that environment sees them as plain variables.
params_list <- list(
  # Plant labels as spelled by the individual sources -- change per plant.
  plantName1 = 'mdo',                    # change name - PLAZA, OrthoDB, RBH
  plantName2 = 'malus_domestica_golden', # change name - compara # sources
  plantName3 = 'mdo_HChap1',             # change name - MCScanX # sources
  plantName4 = 'mdo',                    # change name - FastOMA # sources

  plantDirIn = "mdo_apple",              # inconsistent-IDs, orthofinder
  plantNameOut = "apple",
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "apple"),

  # Regular expressions (and replacements) used to normalise IDs.
  pattern_in = "\\.[^.]*$",              # everything after the last dot
  pattern_out = "",                      # all-IDs
  compara_pattern_in1 = '\\..*',
  compara_pattern_out1 = "",
  compara_pattern_in2 = ".*_",
  compara_pattern_out2 = "",
  plaza_pattern_in1 = '\\..*',
  plaza_pattern_in2 = ".* ",

  ref_genome = "g.Honeycrisp_HAP1_braker1+2_combined_fullSupport_longname_filtered.pep", # inconsistent-IDs

  mercator = 'mdo_Mercator4v7_results.txt',               # plant-gmm
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']",     # plant-gmm
  mercatorPatternOut1 = "",                               # plant-gmm
  mercatorPatternIn2 = "a.g",                             # plant-gmm
  mercatorPatternOut2 = "A.g",                            # plant-gmm

  flag1 = 1,
  flag2 = 1,
  flag3 = FALSE
)

# Fresh environment populated with one variable per list entry.
env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000273e7f90938>

# Knit the child document with `env` as its evaluation environment, so the
# entries of params_list are available to it as variables; progress is shown
# because quiet = FALSE.
child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-10] | |….. | 9% | |…… | 12% [unnamed-chunk-11] | |…….. | 15% | |……… | 18% [unnamed-chunk-12] | |……….. | 21% | |………… | 24% [unnamed-chunk-13] | |………….. | 27% | |…………… | 30% [unnamed-chunk-14] | |…………….. | 33% | |………………. | 36% [unnamed-chunk-15] | |……………….. | 39% | |…………………. | 42% [unnamed-chunk-16] | |………………….. | 45% | |……………………. | 48% [unnamed-chunk-17] | |…………………….. | 52% | |………………………. | 55% [unnamed-chunk-18] | |……………………….. | 58% | |…………………………. | 61% [unnamed-chunk-19] | |………………………….. | 64% | |……………………………. | 67% [unnamed-chunk-20] | |……………………………… | 70% | |………………………………. | 73% [unnamed-chunk-21] | |………………………………… | 76% | |…………………………………. | 79% [unnamed-chunk-22] | |…………………………………… | 82% | |……………………………………. | 85% [unnamed-chunk-23] | |……………………………………… | 88% | |………………………………………. | 91% [unnamed-chunk-24] | |………………………………………… | 94% | |…………………………………………. | 97% [unnamed-chunk-25] | |……………………………………………| 100%

# Write the knitted child output into this document.
cat(child_content)

4 Subsection: mdo

# Create the output folder (including missing parents) on first use.
if (!dir.exists(plantDirOut)) {
  dir.create(plantDirOut, recursive = TRUE)
}

4.1 Ortho sources

# Read every zipped orthology table in ../intermediate, keep the rows of the
# current plant and stack them in `df` with one common layout:
#   from_geneID, from_protID, to_geneID, to_protID, source
fp <- file.path('..', 'intermediate')
fl <- list.files(fp, full.names = TRUE)
fl <- fl[grepl('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees', fl)] # change names
fl <- fl[grepl('\\.zip$', fl)]

std_cols <- c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')

# One recipe per value of the `source` column:
#   keep  - logical row filter selecting the current plant
#   cols  - column positions to keep, already in output order
#           (NULL: the table is used as read and must have five columns)
#   blank - output columns that are overwritten with NA for this source
source_specs <- list(
  'ensembl-compara' = list(
    keep = function(d) d$homology_species == plantName2,
    cols = c(1, 2, 6, 7, 10),
    blank = character(0)
  ),
  'FastOMA' = list(
    keep = function(d) d$to_plant == plantName4,
    cols = c(2, 1, 4, 3, 5),
    blank = c('from_geneID', 'to_geneID')
  ),
  'MCScanX' = list(
    keep = function(d) grepl(plantName3, d$to_plant), # change names
    cols = c(2, 1, 4, 3, 6),
    blank = c('from_geneID', 'to_geneID')
  ),
  'PLAZA' = list(
    keep = function(d) d$orthologous_species == plantName1,
    cols = NULL,
    blank = c('from_protID', 'to_protID')
  ),
  'OrthoDB' = list(
    keep = function(d) d$to_plant == plantName1,
    cols = NULL,
    blank = c('from_protID', 'to_protID')
  ),
  'RBH' = list(
    keep = function(d) d$to_plant == plantName1,
    cols = NULL,
    blank = c('from_protID', 'to_protID')
  )
)

# Tables are collected in a list and bound once at the end instead of
# growing `df` with rbind() on every iteration.
parts <- list()

for (i in fl) {

  print(i)

  dt <- data.table::fread(i)
  us <- unique(dt$source)

  # The recipe lookup needs exactly one source label per file; a vector here
  # would otherwise surface as an obscure "condition has length > 1" error.
  if (length(us) != 1L || is.na(us)) {
    stop("Expected exactly one value in column 'source' of ", i,
         ", found ", length(us), call. = FALSE)
  }

  spec <- source_specs[[us]]
  if (is.null(spec)) {
    # Make the problem visible as a warning rather than a printed string.
    warning("Unknown source '", us, "' in ", i, "; file skipped", call. = FALSE)
    next
  }

  rows <- spec$keep(dt)
  dt <- dt[rows, ]
  if (!is.null(spec$cols)) {
    dt <- dt[, spec$cols, with = FALSE]
  }
  colnames(dt) <- std_cols
  if (length(spec$blank) > 0) {
    # Character NA keeps the column type identical across all sources.
    dt[, (spec$blank) := NA_character_]
  }

  parts[[length(parts) + 1L]] <- dt
}

if (length(parts) == 0L) {
  stop("No orthology table could be read from ", fp, call. = FALSE)
}
df <- data.table::rbindlist(parts, use.names = TRUE)
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
# Number of stacked rows per source.
table(df$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB           PLAZA 
##           22396           79898           60472           56069           33104 
##             RBH 
##           37682
# Preview: the first two and the last two rows of every source, ordered by
# source.
first_last_three_per_source <- dplyr::ungroup(
  dplyr::arrange(
    dplyr::bind_rows(
      dplyr::slice_head(dplyr::group_by(df, source), n = 2),
      dplyr::slice_tail(dplyr::group_by(df, source), n = 2)
    ),
    source
  )
)

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_protID to_geneID                   to_protID          source
##    <chr>       <chr>       <chr>                       <chr>              <chr> 
##  1 <NA>        AT4G10330.1 <NA>                        Maldo.hc.v1a1.ch1… FastO…
##  2 <NA>        AT4G10330.1 <NA>                        Maldo.hc.v1a1.ch1… FastO…
##  3 <NA>        AT5G44410.1 <NA>                        g1.t1              FastO…
##  4 <NA>        AT5G44440.2 <NA>                        g1.t1              FastO…
##  5 <NA>        AT1G71250.1 <NA>                        Maldo.hc.v1a1.ch1… MCSca…
##  6 <NA>        AT1G71250.1 <NA>                        Maldo.hc.v1a1.ch1… MCSca…
##  7 <NA>        AT2G07655.1 <NA>                        Maldo.hc.v1a1.sc4… MCSca…
##  8 <NA>        AT2G07655.1 <NA>                        Maldo.hc.v1a1.sc4… MCSca…
##  9 AT2G46320   <NA>        Maldo.hc.v1a1.ch1A.g26266   <NA>               Ortho…
## 10 AT4G27940   <NA>        Maldo.hc.v1a1.ch1A.g26266   <NA>               Ortho…
## 11 AT2G07695   <NA>        Maldo.hc.v1a1.sc164A.g48922 <NA>               Ortho…
## 12 AT2G07695   <NA>        Maldo.hc.v1a1.sc119A.g48697 <NA>               Ortho…
## 13 AT1G01020   <NA>        MD01G1091900                <NA>               PLAZA 
## 14 AT1G01020   <NA>        MD07G1162500                <NA>               PLAZA 
## 15 AT1G16360   <NA>        MD17G1286800                <NA>               PLAZA 
## 16 AT1G79450   <NA>        MD17G1286800                <NA>               PLAZA 
## 17 AT1G01020   <NA>        Maldo.hc.v1a1.ch7A.g41990   <NA>               RBH   
## 18 AT1G01030   <NA>        Maldo.hc.v1a1.ch1A.g25187   <NA>               RBH   
## 19 ATMG01410   <NA>        Maldo.hc.v1a1.sc36A.g49760  <NA>               RBH   
## 20 ATMG01410   <NA>        Maldo.hc.v1a1.sc71A.g49908  <NA>               RBH   
## 21 AT1G01020   AT1G01020.1 MD01G0070900                mRNA:MD01G0070900  ensem…
## 22 AT1G01050   AT1G01050.1 MD07G0133100                mRNA:MD07G0133100  ensem…
## 23 AT5G67630   AT5G67630.1 MD02G0158200                mRNA:MD02G0158200  ensem…
## 24 AT5G67630   AT5G67630.1 MD15G0258100                mRNA:MD15G0258100  ensem…

4.2 Transcript (aka protein) to geneID

# Where from_geneID is missing, derive it from from_protID by removing a
# trailing ".<digits>" suffix.
ind <- which(is.na(df$from_geneID))
df$from_geneID <- replace(
  df$from_geneID,
  ind,
  sub("\\.[0-9]+$", "", df$from_protID[ind])
)

# orfs! -- gene IDs that still contain a dot, counted per source
ind <- which(grepl('\\.', df$from_geneID))
table(df$source[ind])
## 
## MCScanX 
##      12
# Rows whose from_geneID still contains a dot (indices computed above).
print(df[ind, ])
##         from_geneID     from_protID to_geneID                     to_protID
##              <char>          <char>    <char>                        <char>
##  1: AT1G25470.uORF1 AT1G25470.uORF1      <NA> Maldo.hc.v1a1.ch13A.g09400.t1
##  2: AT1G68550.uORF1 AT1G68550.uORF1      <NA> Maldo.hc.v1a1.ch13A.g09400.t1
##  3: AT4G30960.uORF1 AT4G30960.uORF1      <NA> Maldo.hc.v1a1.ch15A.g15777.t1
##  4: AT1G58120.uORF1 AT1G58120.uORF1      <NA> Maldo.hc.v1a1.ch15A.g18095.t1
##  5: AT1G68550.uORF1 AT1G68550.uORF1      <NA> Maldo.hc.v1a1.ch16A.g19057.t1
##  6: AT4G25670.uORF1 AT4G25670.uORF1      <NA>  Maldo.hc.v1a1.ch1A.g26199.t1
##  7: AT4G25690.uORF1 AT4G25690.uORF1      <NA>  Maldo.hc.v1a1.ch1A.g26199.t1
##  8: AT5G52550.uORF1 AT5G52550.uORF1      <NA>  Maldo.hc.v1a1.ch1A.g26199.t1
##  9: AT4G30960.uORF1 AT4G30960.uORF1      <NA>  Maldo.hc.v1a1.ch2A.g26596.t1
## 10: AT4G25670.uORF1 AT4G25670.uORF1      <NA>  Maldo.hc.v1a1.ch7A.g43057.t1
## 11: AT4G25690.uORF1 AT4G25690.uORF1      <NA>  Maldo.hc.v1a1.ch7A.g43057.t1
## 12: AT5G52550.uORF1 AT5G52550.uORF1      <NA>  Maldo.hc.v1a1.ch7A.g43057.t1
##      source
##      <char>
##  1: MCScanX
##  2: MCScanX
##  3: MCScanX
##  4: MCScanX
##  5: MCScanX
##  6: MCScanX
##  7: MCScanX
##  8: MCScanX
##  9: MCScanX
## 10: MCScanX
## 11: MCScanX
## 12: MCScanX
# Where to_geneID is missing, derive it from to_protID with the configured
# pattern_in / pattern_out substitution.
ind <- which(is.na(df$to_geneID))
df$to_geneID <- replace(
  df$to_geneID,
  ind,
  sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed
)

# Preview: the first two and the last two rows of every source, ordered by
# source.
first_last_three_per_source <- dplyr::ungroup(
  dplyr::arrange(
    dplyr::bind_rows(
      dplyr::slice_head(dplyr::group_by(df, source), n = 2),
      dplyr::slice_tail(dplyr::group_by(df, source), n = 2)
    ),
    source
  )
)

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_protID to_geneID                   to_protID          source
##    <chr>       <chr>       <chr>                       <chr>              <chr> 
##  1 AT4G10330   AT4G10330.1 Maldo.hc.v1a1.ch10A.g00003  Maldo.hc.v1a1.ch1… FastO…
##  2 AT4G10330   AT4G10330.1 Maldo.hc.v1a1.ch10A.g00003  Maldo.hc.v1a1.ch1… FastO…
##  3 AT5G44410   AT5G44410.1 g1                          g1.t1              FastO…
##  4 AT5G44440   AT5G44440.2 g1                          g1.t1              FastO…
##  5 AT1G71250   AT1G71250.1 Maldo.hc.v1a1.ch10A.g00019  Maldo.hc.v1a1.ch1… MCSca…
##  6 AT1G71250   AT1G71250.1 Maldo.hc.v1a1.ch10A.g00019  Maldo.hc.v1a1.ch1… MCSca…
##  7 AT2G07655   AT2G07655.1 Maldo.hc.v1a1.sc45A.g49893  Maldo.hc.v1a1.sc4… MCSca…
##  8 AT2G07655   AT2G07655.1 Maldo.hc.v1a1.sc45A.g49893  Maldo.hc.v1a1.sc4… MCSca…
##  9 AT2G46320   <NA>        Maldo.hc.v1a1.ch1A.g26266   <NA>               Ortho…
## 10 AT4G27940   <NA>        Maldo.hc.v1a1.ch1A.g26266   <NA>               Ortho…
## 11 AT2G07695   <NA>        Maldo.hc.v1a1.sc164A.g48922 <NA>               Ortho…
## 12 AT2G07695   <NA>        Maldo.hc.v1a1.sc119A.g48697 <NA>               Ortho…
## 13 AT1G01020   <NA>        MD01G1091900                <NA>               PLAZA 
## 14 AT1G01020   <NA>        MD07G1162500                <NA>               PLAZA 
## 15 AT1G16360   <NA>        MD17G1286800                <NA>               PLAZA 
## 16 AT1G79450   <NA>        MD17G1286800                <NA>               PLAZA 
## 17 AT1G01020   <NA>        Maldo.hc.v1a1.ch7A.g41990   <NA>               RBH   
## 18 AT1G01030   <NA>        Maldo.hc.v1a1.ch1A.g25187   <NA>               RBH   
## 19 ATMG01410   <NA>        Maldo.hc.v1a1.sc36A.g49760  <NA>               RBH   
## 20 ATMG01410   <NA>        Maldo.hc.v1a1.sc71A.g49908  <NA>               RBH   
## 21 AT1G01020   AT1G01020.1 MD01G0070900                mRNA:MD01G0070900  ensem…
## 22 AT1G01050   AT1G01050.1 MD07G0133100                mRNA:MD07G0133100  ensem…
## 23 AT5G67630   AT5G67630.1 MD02G0158200                mRNA:MD02G0158200  ensem…
## 24 AT5G67630   AT5G67630.1 MD15G0258100                mRNA:MD15G0258100  ensem…
# Per source: how many target gene IDs and target protein IDs are missing.
summary_na <- df[
  ,
  lapply(.SD, function(column) sum(is.na(column))),
  by = source,
  .SDcols = c("to_geneID", "to_protID")
]
data.table::setnames(
  summary_na,
  c("to_geneID", "to_protID"),
  c("na_to_geneID", "na_to_protID")
)
print(summary_na)
##             source na_to_geneID na_to_protID
##             <char>        <int>        <int>
## 1: ensembl-compara            0            0
## 2:         FastOMA            0            0
## 3:         MCScanX            0            0
## 4:         OrthoDB            0        56069
## 5:           PLAZA            0        33104
## 6:             RBH            0        37682

4.3 PLAZA and ensembl-compara with Orthofinder

Here we have some losses, because gene IDs do not translate well between annotation versions!

# Translate the target IDs of the ensembl-compara and PLAZA rows to reference
# genome IDs via the OrthoFinder tables, then reduce `df` to the columns
# from_geneID / to_geneID / source. When flag1 or flag2 equals 4 only the
# column reduction is done.
if (flag1 != 4 && flag2 != 4) {

  fp <- file.path('..', 'input', 'OrthoFinder', plantDirIn)
  fl <- list.files(fp)

  # Read the table whose file name matches `prefix` (an empty 4-column frame
  # when there is none), keep the rows of the reference genome and call the
  # third column 'source'.
  read_orthofinder <- function(prefix) {
    fn <- fl[grep(prefix, fl)] # change filename
    if (length(fn) != 0) {
      tbl <- data.table::fread(file.path(fp, fn))
    } else {
      tbl <- data.frame(matrix(ncol = 4, nrow = 0))
    }
    tbl <- tbl[tbl$Species == ref_genome, ] # change name
    colnames(tbl)[3] <- 'source'
    tbl
  }

  # Split the comma-separated `source` and `Orthologs` cells of every row and
  # return all pairwise combinations as (OrthoDB_ID, Ortholog), row by row.
  expand_pairs <- function(tbl) {
    ids <- stringr::str_split(tbl$source, pattern = ",\\s*") # change colname
    orthologs <- stringr::str_split(tbl$Orthologs, pattern = ",\\s*")
    data.table::rbindlist(Map(
      function(a, b) data.table::CJ(OrthoDB_ID = a, Ortholog = b, sorted = FALSE),
      ids, orthologs
    ))
  }

  # Rows of `df` belonging to `src`, joined to `pairs` on column `key`;
  # to_geneID is replaced by the matched Ortholog and rows without a match
  # are dropped.
  remap_source <- function(src, pairs, key) {
    mapped <- dplyr::filter(df, source == src) %>%
      dplyr::left_join(pairs, by = stats::setNames("OrthoDB_ID", key),
                       relationship = "many-to-many") %>%
      dplyr::mutate(to_geneID = Ortholog) %>%
      dplyr::select(-Ortholog)
    mapped[!is.na(mapped$to_geneID), ]
  }

  compara <- read_orthofinder('Compara_')
  plaza <- read_orthofinder('PLAZA_')
  has_compara <- nrow(compara) != 0
  has_plaza <- nrow(plaza) != 0

  if (has_compara) {
    compara <- expand_pairs(compara)
    # compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    compara$OrthoDB_ID <- sub(compara_pattern_in1, compara_pattern_out1,
                              sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
    compara <- compara[!duplicated(compara), ]
    if (flag3) compara$Ortholog <- gsub('.* ', '', compara$Ortholog) # improve if possible
  }

  if (has_plaza) {
    plaza <- expand_pairs(plaza)
    # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    plaza$OrthoDB_ID <- sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
    plaza <- plaza[!duplicated(plaza), ]
  }

  # flag2 picks the join key for ensembl-compara: 1 = to_geneID,
  # 2 = to_protID. For any other value the compara rows are dropped; the
  # NULL is never subset, so this case no longer raises an error.
  df_compara <- NULL
  if (has_compara) {
    if (flag2 == 1) { # geneID and prot ID are completely different # make flags
      df_compara <- remap_source("ensembl-compara", compara, "to_geneID")
    } else if (flag2 == 2) {
      df_compara <- remap_source("ensembl-compara", compara, "to_protID")
    }
  }

  df_plaza <- if (has_plaza) remap_source("PLAZA", plaza, "to_geneID") else NULL

  # Sources whose rows are replaced by their translated version. Each source
  # is handled on its own, so a PLAZA table is also applied when there is no
  # compara table.
  translated <- c(if (has_compara) "ensembl-compara", if (has_plaza) "PLAZA")

  if (length(translated) > 0) {
    df_other <- dplyr::filter(df, !(source %in% translated))
    dt <- dplyr::bind_rows(df_compara, df_plaza, df_other)
  } else {
    dt <- df
  }

  ind <- c(grep("from_geneID|to_geneID|source", colnames(dt)))
  df <- dt[, ..ind]
  df <- df[!duplicated(df), ]

  # The translated IDs get the same pattern_in / pattern_out clean-up as the
  # other sources.
  if (length(translated) > 0) {
    ind <- which(df$source %in% translated)
    df$to_geneID[ind] <- sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
  }

  # Preview: the first two and the last two rows of every source.
  df %>%
    dplyr::group_by(source) %>%
    dplyr::slice_head(n = 2) %>%
    dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
    dplyr::arrange(source) %>%
    dplyr::ungroup() -> first_last_three_per_source

  print(first_last_three_per_source, n = nrow(first_last_three_per_source))

} else {
  ind <- c(grep("from_geneID|to_geneID|source", colnames(df)))
  df <- df[, ..ind]

}
## # A tibble: 24 × 3
##    from_geneID to_geneID                   source         
##    <chr>       <chr>                       <chr>          
##  1 AT4G10330   Maldo.hc.v1a1.ch10A.g00003  FastOMA        
##  2 AT4G10340   Maldo.hc.v1a1.ch10A.g00004  FastOMA        
##  3 AT5G44410   g1                          FastOMA        
##  4 AT5G44440   g1                          FastOMA        
##  5 AT1G71250   Maldo.hc.v1a1.ch10A.g00019  MCScanX        
##  6 AT4G10170   Maldo.hc.v1a1.ch10A.g00024  MCScanX        
##  7 ATMG01320   Maldo.hc.v1a1.sc45A.g49892  MCScanX        
##  8 AT2G07655   Maldo.hc.v1a1.sc45A.g49893  MCScanX        
##  9 AT2G46320   Maldo.hc.v1a1.ch1A.g26266   OrthoDB        
## 10 AT4G27940   Maldo.hc.v1a1.ch1A.g26266   OrthoDB        
## 11 AT2G07695   Maldo.hc.v1a1.sc164A.g48922 OrthoDB        
## 12 AT2G07695   Maldo.hc.v1a1.sc119A.g48697 OrthoDB        
## 13 AT1G01020   Maldo.hc.v1a1.ch1A.g25188   PLAZA          
## 14 AT1G01020   Maldo.hc.v1a1.ch7A.g41989   PLAZA          
## 15 AT1G16360   Maldo.hc.v1a1.ch17A.g24061  PLAZA          
## 16 AT1G79450   Maldo.hc.v1a1.ch17A.g24061  PLAZA          
## 17 AT1G01020   Maldo.hc.v1a1.ch7A.g41990   RBH            
## 18 AT1G01030   Maldo.hc.v1a1.ch1A.g25187   RBH            
## 19 ATMG01410   Maldo.hc.v1a1.sc36A.g49760  RBH            
## 20 ATMG01410   Maldo.hc.v1a1.sc71A.g49908  RBH            
## 21 AT1G01020   Maldo.hc.v1a1.ch1A.g25188   ensembl-compara
## 22 AT1G01050   Maldo.hc.v1a1.ch1A.g25184   ensembl-compara
## 23 AT5G67630   Maldo.hc.v1a1.ch2A.g27934   ensembl-compara
## 24 AT5G67630   Maldo.hc.v1a1.ch15A.g17150  ensembl-compara
# Remove exact duplicate rows.
df <- unique(df)

# Keep only the objects listed here; everything else returned by ls() is
# removed (the helper vector itself included).
keep_objects <- c(
  "df",
  "ath.gmm", "gn", "sn", "pss_long",
  "plantName1",
  "plantNameOut",
  "plantDirOut",
  "pattern_in",
  "pattern_out",
  "mercator",
  "mercatorPatternIn1",
  "mercatorPatternOut1",
  "mercatorPatternIn2",
  "mercatorPatternOut2",
  "flag1", "flag2"
)
rm(list = setdiff(ls(), keep_objects))

gc()
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1090541 58.3    2083210 111.3  2083210 111.3
## Vcells 4846724 37.0   16017148 122.3 16017148 122.3
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

4.4 To wide format

# `dt` refers to the same table as `df`; count the distinct source gene IDs.
dt <- df
data.table::uniqueN(dt$from_geneID)
## [1] 23053
# Number of distinct target gene IDs.
data.table::uniqueN(dt$to_geneID)
## [1] 34410
# Rows per source after de-duplication.
table(dt$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB           PLAZA 
##           20997           77128           32045           56069           30068 
##             RBH 
##           37682
# One row per (from_geneID, to_geneID) pair and one logical column per
# source: TRUE where that source reports the pair, FALSE otherwise.
data.table::set(dt, j = "present", value = TRUE)

dt.wide <- data.table::dcast(
  dt,
  from_geneID + to_geneID ~ source,
  value.var = "present",
  fill = FALSE
)

dt.wide <- dt.wide[order(from_geneID, to_geneID)]

4.5 Upset plot

# Sources used as sets. flag1 decides which ones are left out:
# 1 = none, 2 = PLAZA, 3 = PLAZA and ensembl-compara, anything else =
# PLAZA, ensembl-compara and OrthoDB.
all_sources <- c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
excluded_sources <- if (flag1 == 1) {
  character(0)
} else if (flag1 == 2) {
  "PLAZA"
} else if (flag1 == 3) {
  c("ensembl-compara", "PLAZA")
} else {
  c("ensembl-compara", "PLAZA", 'OrthoDB')
}
source_cols <- all_sources[!(all_sources %in% excluded_sources)]

# Number of selected sources supporting each gene pair.
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))

dff <- as.data.frame(dt.wide)

# Intersections are sorted by degree and then by cardinality, descending.
upset_plot <- ComplexUpset::upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = ComplexUpset::intersection_size(counts = FALSE)
  ),
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending"
) +
  ggplot2::ggtitle("Overlap of gene pairs supported by multiple methods")

# Show the plot and save it as PDF
print(upset_plot)

ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"),
  plot = upset_plot, width = 24, height = 6, device = "pdf"
) # change name

4.6 Ath ORFs

  • Take care: besides transcripts such as AT1G30330.1, AT1G30330.2 and AT1G30330.3, the ath CDS FASTA (used for MCScanX) also contains uORF entries, e.g.:
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
# Gene pairs whose Arabidopsis ID contains "ORF".
dt.wide[grepl('ORF', from_geneID)]
## Key: <from_geneID, to_geneID>
##         from_geneID                  to_geneID FastOMA MCScanX OrthoDB  PLAZA
##              <char>                     <char>  <lgcl>  <lgcl>  <lgcl> <lgcl>
##  1: AT1G25470.uORF1 Maldo.hc.v1a1.ch13A.g09400   FALSE    TRUE   FALSE  FALSE
##  2: AT1G58120.uORF1 Maldo.hc.v1a1.ch15A.g18095   FALSE    TRUE   FALSE  FALSE
##  3: AT1G68550.uORF1 Maldo.hc.v1a1.ch13A.g09400   FALSE    TRUE   FALSE  FALSE
##  4: AT1G68550.uORF1 Maldo.hc.v1a1.ch16A.g19057   FALSE    TRUE   FALSE  FALSE
##  5: AT4G25670.uORF1  Maldo.hc.v1a1.ch1A.g26199   FALSE    TRUE   FALSE  FALSE
##  6: AT4G25670.uORF1  Maldo.hc.v1a1.ch7A.g43057   FALSE    TRUE   FALSE  FALSE
##  7: AT4G25690.uORF1  Maldo.hc.v1a1.ch1A.g26199   FALSE    TRUE   FALSE  FALSE
##  8: AT4G25690.uORF1  Maldo.hc.v1a1.ch7A.g43057   FALSE    TRUE   FALSE  FALSE
##  9: AT4G30960.uORF1 Maldo.hc.v1a1.ch15A.g15777   FALSE    TRUE   FALSE  FALSE
## 10: AT4G30960.uORF1  Maldo.hc.v1a1.ch2A.g26596   FALSE    TRUE   FALSE  FALSE
## 11: AT5G52550.uORF1  Maldo.hc.v1a1.ch1A.g26199   FALSE    TRUE   FALSE  FALSE
## 12: AT5G52550.uORF1  Maldo.hc.v1a1.ch7A.g43057   FALSE    TRUE   FALSE  FALSE
##        RBH ensembl-compara count_evidence
##     <lgcl>          <lgcl>          <num>
##  1:  FALSE           FALSE              1
##  2:  FALSE           FALSE              1
##  3:  FALSE           FALSE              1
##  4:  FALSE           FALSE              1
##  5:  FALSE           FALSE              1
##  6:  FALSE           FALSE              1
##  7:  FALSE           FALSE              1
##  8:  FALSE           FALSE              1
##  9:  FALSE           FALSE              1
## 10:  FALSE           FALSE              1
## 11:  FALSE           FALSE              1
## 12:  FALSE           FALSE              1
# Remove the gene pairs whose Arabidopsis ID contains "ORF".
dt.wide <- dt.wide[!grepl('ORF', dt.wide$from_geneID), ]

4.7 Gene occurrence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

4.8 In/out PSS

# Attach the Arabidopsis annotation to every gene pair: Mercator bins, gene
# names, synonyms and the PSS columns, each as a left join on the gene ID.
df <- merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df <- Reduce(
  function(acc, lookup) merge(acc, lookup, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE),
  list(gn, sn),
  df
)
df <- merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

# PSS ids starting with "AT" that have no row in `df`, annotated the same
# way and written to a spreadsheet.
nin <- pss_long[!(pss_long$id %in% df$from_geneID) & grepl('^AT', pss_long$id), ]
nin <- merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin <- Reduce(
  function(acc, lookup) merge(acc, lookup, by.x = 'id', by.y = 'V1', all.x = TRUE),
  list(gn, sn),
  nin
)

openxlsx::write.xlsx(
  nin,
  paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
  asTable = TRUE
) # change name

4.9 fruitTrees plant gmm

# Read the Mercator result file of the target plant and drop rows whose
# identifier is only a pair of quotes.
fn = mercator
fp = file.path('..', 'input', 'Mercator')
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[IDENTIFIER != "''"]

# One row per identifier: distinct values of each annotation column are joined
# with " | ".
combined = gmm[,
               lapply(.SD, function(x) paste(unique(x), collapse = " | ")),
               by = IDENTIFIER,
               .SDcols = c("BINCODE", "NAME", "DESCRIPTION")]

# Normalise the Mercator identifiers so that they can be joined to to_geneID.
# The steps are order dependent and driven by the per-plant parameters
# (mercatorPatternIn1/Out1, mercatorPatternIn2/Out2, pattern_in/pattern_out).
# charToRaw() is only a visual check of the bytes of the first identifier.
charToRaw(combined$IDENTIFIER[1])
##  [1] 27 6d 61 6c 64 6f 2e 68 63 2e 76 31 61 31 2e 63 68 33 61 2e 67 33 31 32 39
## [26] 31 2e 74 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
# Step 1: gsub() removes every match of mercatorPatternIn1 (all occurrences, not just the first).
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
charToRaw(combined$IDENTIFIER[1])
##  [1] 6d 61 6c 64 6f 2e 68 63 2e 76 31 61 31 2e 63 68 33 61 2e 67 33 31 32 39 31
## [26] 2e 74 31
# Step 2: upper-case the first character only.
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2))  # change as needed
# Step 3: plant-specific replacement of every match of mercatorPatternIn2.
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
# Step 4: same pattern_in/pattern_out substitution that is applied to to_protID
# when deriving to_geneID (first match only).
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
# Sanity check: how many normalised identifiers occur among the orthologue targets.
# NOTE(review): `dt` is whatever table was left by an earlier chunk — confirm it
# still holds the to_geneID column intended here.
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
## 14272 35833
# NOTE(review): sub() removes only the FIRST apostrophe of each value (see the
# remark above about sub()). Values quoted at both ends, or collapsed from
# several quoted values with " | ", keep their remaining apostrophes — confirm
# whether that is intended before relying on these columns for exact matching.
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)

# Prefix the three annotation columns so they do not clash with the ath_* ones.
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

# Attach the plant Mercator annotation to every pair (left join on to_geneID).
colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "ensembl-compara"
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "athName"         "athSynonims"    
## [17] "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
# Pairs whose target gene has no Mercator annotation end up with NA bins.
table(is.na(dt$fruitTrees_BINCODE))
## 
##  FALSE   TRUE 
## 122701     27
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
##  [1] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
## [16] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
# merge() moves the key column (to_geneID) to the front; restore the original
# column order of df and append the newly added columns at the end.
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
# Free memory: remove every object from the workspace except the merged table,
# the Arabidopsis annotation tables and the per-plant parameters listed below.
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))


gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   6627136 354.0   11731995  626.6  11731995  626.6
## Vcells 112262491 856.5  190628278 1454.4 158749983 1211.2
library(magrittr)
library(ggplot2)
library(ComplexUpset)

4.10 Translation table

MapMan Mercator matches: first three levels only

# Remove rows that are exact duplicates across all columns.
df = df[!duplicated(df), ]


# Compare two Mercator/MapMan bin annotations at the first three bin levels.
#
# athMercator, plantXMercator: single strings (or NA) holding one or more bin
#   codes separated by "|" and/or ";", e.g. "1.1.1.2 | 5.3".
#
# Returns one string:
#   "100% match based on <bins>"     both sides have the same set of 3-level bins
#   "partial match based on <bins>"  the sets overlap but are not identical
#   "no match"                       no overlap, or at least one side has no bin
#
# A missing or empty annotation on either side is always "no match"; two
# missing annotations are not treated as identical sets.
compare_bin <- function(athMercator, plantXMercator) {
  # Split one annotation string into its unique bin codes, truncated to the
  # first three dot-separated levels.
  split_tokens = function(code) {
    if (length(code) == 0 || is.na(code) || code == "") return(character(0))
    tokens = unlist(strsplit(code, "[|;]"))
    # Stray apostrophes left over from the quoted Mercator output must not
    # make otherwise identical bins look different.
    tokens = gsub("'", "", tokens, fixed = TRUE)
    tokens = trimws(tokens)
    # Empty pieces (e.g. from a trailing separator) are not bins.
    tokens = tokens[nzchar(tokens)]
    # Keep the first three levels when a code has more than three.
    tokens = sub("^(([^.]*\\.){2}[^.]*)\\..*$", "\\1", tokens)
    unique(tokens)
  }

  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)

  # Without bins on both sides there is nothing to compare.
  if (length(bin_set) == 0 || length(v4_set) == 0) {
    return("no match")
  }

  # Identical sets (this also covers a plant annotation that merely repeats
  # the single Arabidopsis bin several times).
  if (setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }

  # Partial match if any bins overlap; report the shared ones.
  common_tokens = intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }

  "no match"
}



# Evaluate compare_bin() once per row (it expects scalar inputs) and store the
# verdict in MapMan4_Match; the row-wise grouping is dropped again afterwards.
df = dplyr::ungroup(
  dplyr::mutate(
    dplyr::rowwise(df),
    MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE) # change name
  )
)

4.11 Filter

# now

# Summary before filtering: number of distinct genes on each side, range of the
# per-gene occurrence counts, and how many genes occur in more than 30 rows.
cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 23046
length(unique(df$to_geneID))
## [1] 34408
range(df$from_count)
## [1]   1 128
range(df$to_count)
## [1]   1 116
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 319
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 288
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# Working copy for the filter; every pair starts as "reject" and is relabelled
# with the criterion that accepts it.
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


# Methods in the order in which the filter processes them; flag1 selects which
# sources are used (any value other than 1, 2 or 3 gives the shortest list).
if (flag1 == 1) {
  methods = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  methods = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


# NOTE(review): match_categories is not referenced in the code visible below.
match_categories = c("no match", "100% match based", "partial match")

# For every method, count the pairs it supports, split by MapMan verdict
# (three rows per method), and stack the results into one long table.
long_dt = data.table::rbindlist(
  lapply(methods, function(method) {
    dt[, {
      supported = get(method) == TRUE
      .(
        Method = method,
        Match_Type = c("no match", "100% match based", "partial match"),
        Count = c(
          sum(supported & MapMan4_Match == "no match"),
          sum(supported & stringr::str_detect(MapMan4_Match, "100% match based")),
          sum(supported & stringr::str_detect(MapMan4_Match, "partial match"))
        )
      )
    }]
  }),
  use.names = TRUE
)

# Fix the stacking order of the bars.
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

# Stacked bars: one bar per method, coloured by verdict.
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_col() +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type")

# Saves the plot displayed last.
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Cross-tabulate the number of supporting methods (count_evidence) against the
# MapMan verdict, with the bin list stripped from the verdict text.
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
# Removing 'based on...' leaves the blank that preceded it, so the shortened
# labels are "100% match " and "partial match " (trailing space); "no match"
# is unchanged.
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          95834          16984           4836
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       47777    14508           4314
##   2       13486     1598            273
##   3        8903      517             98
##   4        8949      203             63
##   5       10121      123             56
##   6        6598       35             32
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
# The trailing blanks in the levels are required: they must equal the
# shortened labels produced by sub() above, otherwise the values become NA.
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

# Stacked bars: one bar per evidence count, coloured by verdict.
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




# Hierarchical filter. Methods are processed in the order given by `methods`.
# A pair is a candidate for a method when it is still "reject", the method
# supports it, and neither of its genes is in `covered_genes` (the target genes
# accepted by the methods processed before).
#   * Non-special methods accept all their candidates.
#   * Special methods accept a candidate only when at least two special
#     methods support it, or when it is the only one and the MapMan comparison
#     reports a (full or partial) match.
# Candidates are fixed per method before any of them is accepted, so pairs
# accepted by the same method do not exclude each other.

# OrthoDB is treated as a special method only when neither flag equals 4.
if (flag1 != 4 && flag2 != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Named vector counting the <method>_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

# Loop invariants: the method columns and MapMan4_Match are not modified by
# the filter, so the per-row evidence can be computed once for all rows.
# A missing (NA) value in a method column counts as "not supported".
special_hits = matrix(FALSE, nrow = nrow(dt), ncol = length(special_methods),
                      dimnames = list(NULL, special_methods))
for (m in special_methods) {
  hit = dt[[m]] == TRUE
  hit[is.na(hit)] = FALSE
  special_hits[, m] = hit
}
n_special = rowSums(special_hits)

# Names of the supporting special methods in sorted order, joined by "_"
# (e.g. "FastOMA_RBH"); used as label when exactly two of them agree.
combo_label = character(nrow(dt))
for (m in sort(special_methods)) {
  hit = special_hits[, m]
  combo_label[hit] = ifelse(combo_label[hit] == "", m,
                            paste(combo_label[hit], m, sep = "_"))
}

# TRUE where compare_bin() reported a full or partial match.
# reconsider:
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
#   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
has_mapman_match = grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE)

for (method in methods) {

  candidates = which(dt$filter_criteria == "reject" & dt[[method]] == TRUE &
                       !(dt$to_geneID %in% covered_genes) &
                       !(dt$from_geneID %in% covered_genes))
  if (length(candidates) == 0) next

  if (method %in% special_methods) {
    n_hit = n_special[candidates]
    new_criteria = rep(NA_character_, length(candidates))

    # All three special methods agree
    new_criteria[n_hit == 3] = "OrthoDB_FastOMA_RBH"
    # Exactly two agree: label with the sorted pair of method names
    two = n_hit == 2
    new_criteria[two] = combo_label[candidates][two]
    # Only the current method: accepted when MapMan reports a match
    solo = n_hit == 1 & has_mapman_match[candidates]
    mapman_label = paste0(method, "_MapMan4")
    new_criteria[solo] = mapman_label
    mapman4_counts[[mapman_label]] = mapman4_counts[[mapman_label]] + sum(solo)

    # Candidates without a label stay "reject"
    is_accepted = !is.na(new_criteria)
    accepted = candidates[is_accepted]
    new_criteria = new_criteria[is_accepted]
    if (length(accepted) > 0) {
      dt[accepted, filter_criteria := new_criteria]
    }
  } else {
    accepted = candidates
    dt[candidates, filter_criteria := method]
  }

  # Only target genes are recorded as covered
  # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
  covered_genes = union(covered_genes, dt$to_geneID[accepted])
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            5183            1882            4015
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# Number of pairs per accepting criterion; "reject" are the pairs no criterion accepted.
table(dt$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                5281                4015                1242                1072 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               32088                 579                5183                 670 
##               PLAZA         RBH_MapMan4              reject 
##                6577                1882               59065
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# Plain assignment does not copy a data.table: df and dt now refer to the same
# object, so later `:=` updates on df are visible through dt as well.
df = dt

# Export all pairs (accepted and rejected) as tab-separated text and as xlsx.
data.table::fwrite(df, 
                   paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

4.12 Filtered

# Split the pairs by filter verdict.
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Recompute the per-gene occurrence counts by reference (no merge needed):
# on df over all pairs, on kept over the accepted pairs only.
# `rejected` was extracted above and keeps the counts it had at that point.
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





# 2 x 2 panel of occurrence-count histograms: all pairs (df) versus accepted
# pairs (kept), for from_count (top row) and to_count (bottom row).
par(mfrow = c(2,2))
# Common x range for all panels; counts above 100 are outside the drawn range.
xlim = c(0,100)
# Histograms are first computed without plotting to find a common y limit.
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
# Common title across the four panels
mtext("Before and after filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

# Same per-method tally as before the filter, restricted to the accepted pairs:
# three rows per method, one per MapMan verdict.
long_kept = data.table::rbindlist(
  lapply(methods, function(method) {
    kept[, {
      supported = get(method) == TRUE
      .(
        Method = method,
        Match_Type = c("no match", "100% match based", "partial match"),
        Count = c(
          sum(supported & MapMan4_Match == "no match"),
          sum(supported & stringr::str_detect(MapMan4_Match, "100% match based")),
          sum(supported & stringr::str_detect(MapMan4_Match, "partial match"))
        )
      )
    }]
  }),
  use.names = TRUE
)

# Fix the stacking order of the bars.
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

# Stacked bars: one bar per method, coloured by verdict.
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_col() +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type")

# Saves the plot displayed last.
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Evidence count versus MapMan verdict for the accepted pairs only.
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
# Removing 'based on...' leaves the blank that preceded it, so the shortened
# labels are "100% match " and "partial match " (trailing space).
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          53789           3695           1105
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       14991     1733            738
##   2        7682     1198            145
##   3        6646      416             80
##   4        8004      191             59
##   5        9868      122             51
##   6        6598       35             32
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
# The trailing blanks in the levels must equal the shortened labels above.
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

# Stacked bars: one bar per evidence count, coloured by verdict.
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Accepted pairs broken down by evidence count, accepting criterion and MapMan
# verdict; one facet per evidence count.
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# Here the blank before 'based on' is removed too, so the labels carry no
# trailing space.
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Drop empty combinations so that unused criteria do not appear in a facet.
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
# Display order of the criteria on the x axis; both spellings of the
# FastOMA/OrthoDB pair are listed.
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    # border around each facet; `linewidth` replaces the `size` argument that
    # is deprecated for element_rect() since ggplot2 3.4.0
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
  )
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Save the facet plot shown above (ggsave() defaults to the last plot displayed).
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Export the rejected pairs.
openxlsx::write.xlsx(rejected, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


# Build an undirected graph from the distinct accepted pairs and label every
# gene with the id of the connected component it belongs to.
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# The component id is attached via from_geneID only. The commented lines below
# are the variant that looks it up for both ends, which would be needed for a
# directed graph.
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


# Export the accepted pairs, including the component id.
openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


# Sets shown in the upset plot: the same flag1-dependent method list that the
# filter used. Reusing `methods` (defined before the filter) keeps the two
# from drifting apart.
source_cols = methods





# Upset plot of the accepted pairs: each method column in source_cols is one
# set, so the bars show how many pairs are supported by each combination of
# methods.
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

# Wide page (24 x 6 in)
ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



# Same summary as before the filter, now for the accepted pairs: distinct genes
# per side, range of the recomputed occurrence counts, genes in more than 30 pairs.
cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 21121
length(unique(kept$to_geneID))
## [1] 33249
range(kept$from_count)
## [1]  1 93
range(kept$to_count)
## [1]  1 96
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 32
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 28
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

4.13 PSS kept/rejected

# Reduce the PSS table to its id / all_pathways / short_name columns, then
# attach the target gene, Arabidopsis annotation, MapMan verdict and filter
# verdict of every pair (accepted and rejected) whose from_geneID is that id.
# NOTE(review): the column selection by position assumes pss_long is a plain
# data.frame at this point — confirm.
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
# NOTE(review): the columns are picked with names(dt) although the table that
# is subset is df; both hold the same columns here because df was assigned
# from dt without copying.
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
# Keep only ids starting with 'AT' and drop duplicated rows.
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 163                  83                  71                  35 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                1337                  26                 122                  30 
##               PLAZA         RBH_MapMan4              reject 
##                 264                  29                1788
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
# Parameters for one plant; they are copied into a fresh environment in which
# the child document is knitted. The trailing comments name the step or source
# that uses each value.
params_list <- list(
  plantName1 = 'ppe', # change name - PLAZA, OrthoDB, RBH
  plantName2 = 'prunus_persica', # change name - compara # sources
  plantName3 = 'ppe',  # change name - MCScanX # sources
  plantName4 = 'pper',  # change name - FastOMA # sources
  
  plantDirIn = "ppe_peach", # inconsistent-IDs, orthofinder
  plantNameOut = "peach",
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "peach"),

  pattern_in = "\\.[^.]*$", # everything after the last dot (including the dot)
  pattern_out = "", # all-IDs
  compara_pattern_in1 = "",
  compara_pattern_out1 = "",
  compara_pattern_in2 = "",
  compara_pattern_out2 = "",
  plaza_pattern_in1 = "",
  plaza_pattern_in2 = "",
  
  ref_genome = "PLAZA_proteome.selected_transcript.ppe", # inconsistent-IDs, orthofinder for OrthoDB
  
  mercator = 'pper_Mercator4v7_results.txt', # plant-gmm
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']", # plant-gmm, generic removal of stray quote characters
  mercatorPatternOut1 = "", # plant-gmm
  mercatorPatternIn2 = "g", # plant-gmm
  mercatorPatternOut2 = "G", # plant-gmm
  flag1 = 1,
  flag2 = 2,
  flag3 = FALSE
)

# note: in compara - geneID and prot ID are completely different

# Every list element becomes a variable of the same name in `env`.
env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000027486c87620>

# Knit the child document with the parameters in `env` as its variables; the
# rendered result is returned as text.
child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-44] | |….. | 9% | |…… | 12% [unnamed-chunk-45] | |…….. | 15% | |……… | 18% [unnamed-chunk-46] | |……….. | 21% | |………… | 24% [unnamed-chunk-47] | |………….. | 27% | |…………… | 30% [unnamed-chunk-48] | |…………….. | 33% | |………………. | 36% [unnamed-chunk-49] | |……………….. | 39% | |…………………. | 42% [unnamed-chunk-50] | |………………….. | 45% | |……………………. | 48% [unnamed-chunk-51] | |…………………….. | 52% | |………………………. | 55% [unnamed-chunk-52] | |……………………….. | 58% | |…………………………. | 61% [unnamed-chunk-53] | |………………………….. | 64% | |……………………………. | 67% [unnamed-chunk-54] | |……………………………… | 70% | |………………………………. | 73% [unnamed-chunk-55] | |………………………………… | 76% | |…………………………………. | 79% [unnamed-chunk-56] | |…………………………………… | 82% | |……………………………………. | 85% [unnamed-chunk-57] | |……………………………………… | 88% | |………………………………………. | 91% [unnamed-chunk-58] | |………………………………………… | 94% | |…………………………………………. | 97% [unnamed-chunk-59] | |……………………………………………| 100%

# Write the rendered child text to the output of this document.
cat(child_content)

5 Subsection: ppe

# Create the report directory of this plant (with parents) if it is missing.
if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

5.1 Ortho sources

# Collect the orthologue pairs of the selected plant from every source file
# into one table with the columns
#   from_geneID, from_protID, to_geneID, to_protID, source.
# Each file must carry exactly one value in its `source` column; that value
# decides how the file is filtered to the plant and which ID columns it
# provides. Sources with protein-level IDs get their gene ID columns set to NA
# (derived later), sources with gene-level IDs get their protein ID columns
# set to NA.
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees', fl)] # change names
fl = fl[grep('\\.zip$', fl)]

std_cols = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  # The branches below compare `us` as a scalar; fail with a clear message
  # instead of an obscure `if` error when a file mixes sources or has none.
  if (length(us) != 1L || is.na(us)) {
    stop('Expected exactly one non-missing value in column "source" of ', i,
         call. = FALSE)
  }
  
  if (us == 'ensembl-compara') {
    
    # gene and protein IDs are both provided
    dt = dt[dt$homology_species == plantName2, ]
    dt = dt[, c(1,2,6,7,10)]
    colnames(dt) = std_cols
    
  } else if (us == 'FastOMA') {
    
    # protein-level IDs only
    dt = dt[dt$to_plant == plantName4, ]
    dt = dt[, c(2,1, 4,3, 5)]
    colnames(dt) = std_cols
    dt[, 1] = NA
    dt[, 3] = NA
    
  } else if (us == 'MCScanX') {
    
    # protein-level IDs only; the plant is matched as a pattern
    dt = dt[grepl(plantName3, dt$to_plant), ] #  change names
    dt = dt[, c(2,1, 4,3, 6)]
    colnames(dt) = std_cols
    dt[, 1] = NA
    dt[, 3] = NA
    
  } else if (us == 'PLAZA') {
    
    # gene-level IDs only
    dt = dt[dt$orthologous_species == plantName1, ]
    colnames(dt) = std_cols
    dt[, 2] = NA
    dt[, 4] = NA
    
  } else if (us %in% c('OrthoDB', 'RBH')) {
    
    # gene-level IDs only
    dt = dt[dt$to_plant == plantName1, ]
    colnames(dt) = std_cols
    dt[, 2] = NA
    dt[, 4] = NA
    
  } else {
    
    # Raise a proper warning (not a plain print) and leave the file out.
    warning('Unknown source "', us, '" in ', i, '; file skipped', call. = FALSE)
    next
    
  }
  
  df = rbind(df, dt)
}
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
table(df$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB           PLAZA 
##           16851           44006           62733           38370           20774 
##             RBH 
##           24564
# Preview: the first two and the last two rows of every source, ordered by
# source (the object name says "three", but two rows are taken from each end).
first_last_three_per_source = dplyr::ungroup(
  dplyr::arrange(
    dplyr::bind_rows(
      dplyr::slice_head(dplyr::group_by(df, source), n = 2),
      dplyr::slice_tail(dplyr::group_by(df, source), n = 2)
    ),
    source
  )
)

# Show all rows instead of the default truncated tibble print.
print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_protID to_geneID      to_protID        source         
##    <chr>       <chr>       <chr>          <chr>            <chr>          
##  1 <NA>        AT1G12040.1 <NA>           Prupe.1G000500.1 FastOMA        
##  2 <NA>        AT1G62440.1 <NA>           Prupe.1G000500.1 FastOMA        
##  3 <NA>        AT1G61010.3 <NA>           Prupe.I006100.1  FastOMA        
##  4 <NA>        AT1G61010.3 <NA>           Prupe.I006200.1  FastOMA        
##  5 <NA>        AT5G58130.1 <NA>           Prupe.1G000700.1 MCScanX        
##  6 <NA>        AT5G58110.1 <NA>           Prupe.1G000800.1 MCScanX        
##  7 <NA>        AT4G02060.1 <NA>           Prupe.8G272000.1 MCScanX        
##  8 <NA>        AT4G02060.2 <NA>           Prupe.8G272000.1 MCScanX        
##  9 AT3G17900   <NA>        Prupe.1G267800 <NA>             OrthoDB        
## 10 AT4G35230   <NA>        Prupe.1G355500 <NA>             OrthoDB        
## 11 AT2G07675   <NA>        Prupe.6G146800 <NA>             OrthoDB        
## 12 ATMG00980   <NA>        Prupe.6G146800 <NA>             OrthoDB        
## 13 AT1G01020   <NA>        Prupe.2G201100 <NA>             PLAZA          
## 14 AT1G01050   <NA>        Prupe.2G200700 <NA>             PLAZA          
## 15 AT1G52360   <NA>        Prupe.I003200  <NA>             PLAZA          
## 16 AT5G40850   <NA>        Prupe.I005100  <NA>             PLAZA          
## 17 AT1G01030   <NA>        Prupe.5G134900 <NA>             RBH            
## 18 AT1G01040   <NA>        Prupe.2G200900 <NA>             RBH            
## 19 ATMG01250   <NA>        Prupe.6G123900 <NA>             RBH            
## 20 ATMG01250   <NA>        Prupe.7G164000 <NA>             RBH            
## 21 AT1G01020   AT1G01020.1 PRUPE_2G201100 ONI23664         ensembl-compara
## 22 AT1G01040   AT1G01040.2 PRUPE_2G200900 ONI23660         ensembl-compara
## 23 AT5G67620   AT5G67620.1 PRUPE_6G219300 ONI02738         ensembl-compara
## 24 AT5G67630   AT5G67630.1 PRUPE_1G544700 ONI35595         ensembl-compara

5.2 Transcript (aka protein) to geneID

# Where the Arabidopsis gene ID is missing, derive it from the protein ID by
# removing a trailing ".<digits>" suffix.
ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])

# orfs!
# IDs that still contain a dot did not end in ".<digits>" and were left unchanged.
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)
## 
## MCScanX 
##      23
print(df[ind, ])
##         from_geneID     from_protID to_geneID        to_protID  source
##              <char>          <char>    <char>           <char>  <char>
##  1: AT3G25570.uORF1 AT3G25570.uORF1      <NA> Prupe.1G299600.1 MCScanX
##  2: AT1G25470.uORF1 AT1G25470.uORF1      <NA> Prupe.1G310000.1 MCScanX
##  3: AT1G68550.uORF1 AT1G68550.uORF1      <NA> Prupe.1G310000.1 MCScanX
##  4: AT1G23150.uORF1 AT1G23150.uORF1      <NA> Prupe.1G329400.1 MCScanX
##  5: AT1G70780.uORF1 AT1G70780.uORF1      <NA> Prupe.1G329400.1 MCScanX
##  6: AT1G75390.uORF1 AT1G75390.uORF1      <NA> Prupe.1G374500.1 MCScanX
##  7: AT5G50010.uORF2 AT5G50010.uORF2      <NA> Prupe.1G527700.1 MCScanX
##  8: AT4G25670.uORF1 AT4G25670.uORF1      <NA> Prupe.2G300500.1 MCScanX
##  9: AT4G25690.uORF1 AT4G25690.uORF1      <NA> Prupe.2G300500.1 MCScanX
## 10: AT5G52550.uORF1 AT5G52550.uORF1      <NA> Prupe.2G300500.1 MCScanX
## 11: AT5G53590.uORF1 AT5G53590.uORF1      <NA> Prupe.2G317000.1 MCScanX
## 12: AT3G02470.uORF1 AT3G02470.uORF1      <NA> Prupe.3G243800.1 MCScanX
## 13: AT5G15950.uORF1 AT5G15950.uORF1      <NA> Prupe.3G243800.1 MCScanX
## 14: AT1G29950.uORF2 AT1G29950.uORF2      <NA> Prupe.4G077000.1 MCScanX
## 15: AT4G19110.uORF1 AT4G19110.uORF1      <NA> Prupe.5G021200.1 MCScanX
## 16: AT5G45430.uORF1 AT5G45430.uORF1      <NA> Prupe.5G021200.1 MCScanX
## 17: AT2G27230.uORF1 AT2G27230.uORF1      <NA> Prupe.6G144400.1 MCScanX
## 18: AT3G12010.uORF1 AT3G12010.uORF1      <NA> Prupe.7G082600.1 MCScanX
## 19: AT4G36990.uORF1 AT4G36990.uORF1      <NA> Prupe.7G133500.1 MCScanX
## 20: AT2G18160.uORF1 AT2G18160.uORF1      <NA> Prupe.7G160500.1 MCScanX
## 21: AT5G09460.uORF1 AT5G09460.uORF1      <NA> Prupe.8G067700.1 MCScanX
## 22: AT5G64340.uORF1 AT5G64340.uORF1      <NA> Prupe.8G067700.1 MCScanX
## 23: AT4G34590.uORF1 AT4G34590.uORF1      <NA> Prupe.8G091700.1 MCScanX
##         from_geneID     from_protID to_geneID        to_protID  source
# Where the target gene ID is missing, derive it from the target protein ID
# using the species-specific pattern_in / pattern_out pair.
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed

# Preview: first two and last two rows of every source, ordered by source.
by_source = dplyr::group_by(df, source)
first_last_three_per_source = dplyr::bind_rows(
  dplyr::slice_head(by_source, n = 2),
  dplyr::slice_tail(by_source, n = 2)
)
first_last_three_per_source = dplyr::ungroup(
  dplyr::arrange(first_last_three_per_source, source)
)

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_protID to_geneID      to_protID        source         
##    <chr>       <chr>       <chr>          <chr>            <chr>          
##  1 AT1G12040   AT1G12040.1 Prupe.1G000500 Prupe.1G000500.1 FastOMA        
##  2 AT1G62440   AT1G62440.1 Prupe.1G000500 Prupe.1G000500.1 FastOMA        
##  3 AT1G61010   AT1G61010.3 Prupe.I006100  Prupe.I006100.1  FastOMA        
##  4 AT1G61010   AT1G61010.3 Prupe.I006200  Prupe.I006200.1  FastOMA        
##  5 AT5G58130   AT5G58130.1 Prupe.1G000700 Prupe.1G000700.1 MCScanX        
##  6 AT5G58110   AT5G58110.1 Prupe.1G000800 Prupe.1G000800.1 MCScanX        
##  7 AT4G02060   AT4G02060.1 Prupe.8G272000 Prupe.8G272000.1 MCScanX        
##  8 AT4G02060   AT4G02060.2 Prupe.8G272000 Prupe.8G272000.1 MCScanX        
##  9 AT3G17900   <NA>        Prupe.1G267800 <NA>             OrthoDB        
## 10 AT4G35230   <NA>        Prupe.1G355500 <NA>             OrthoDB        
## 11 AT2G07675   <NA>        Prupe.6G146800 <NA>             OrthoDB        
## 12 ATMG00980   <NA>        Prupe.6G146800 <NA>             OrthoDB        
## 13 AT1G01020   <NA>        Prupe.2G201100 <NA>             PLAZA          
## 14 AT1G01050   <NA>        Prupe.2G200700 <NA>             PLAZA          
## 15 AT1G52360   <NA>        Prupe.I003200  <NA>             PLAZA          
## 16 AT5G40850   <NA>        Prupe.I005100  <NA>             PLAZA          
## 17 AT1G01030   <NA>        Prupe.5G134900 <NA>             RBH            
## 18 AT1G01040   <NA>        Prupe.2G200900 <NA>             RBH            
## 19 ATMG01250   <NA>        Prupe.6G123900 <NA>             RBH            
## 20 ATMG01250   <NA>        Prupe.7G164000 <NA>             RBH            
## 21 AT1G01020   AT1G01020.1 PRUPE_2G201100 ONI23664         ensembl-compara
## 22 AT1G01040   AT1G01040.2 PRUPE_2G200900 ONI23660         ensembl-compara
## 23 AT5G67620   AT5G67620.1 PRUPE_6G219300 ONI02738         ensembl-compara
## 24 AT5G67630   AT5G67630.1 PRUPE_1G544700 ONI35595         ensembl-compara
# Per-source count of rows that still lack a target gene ID / protein ID.
summary_na = df[, lapply(.SD, function(col) sum(is.na(col))),
                by = source, .SDcols = c("to_geneID", "to_protID")]
data.table::setnames(summary_na,
                     c("to_geneID", "to_protID"),
                     c("na_to_geneID", "na_to_protID"))
print(summary_na)
##             source na_to_geneID na_to_protID
##             <char>        <int>        <int>
## 1: ensembl-compara            0            0
## 2:         FastOMA            0            0
## 3:         MCScanX            0            0
## 4:         OrthoDB            0        38370
## 5:           PLAZA            0        20774
## 6:             RBH            0        24564

5.3 PLAZA and ensembl-compara with Orthofinder

Here we have some losses, because gene IDs do not translate well between annotation versions!

# Translate the target IDs reported by ensembl-compara and PLAZA to the current
# annotation, using OrthoFinder tables found in ../input/OrthoFinder/<plantDirIn>.
# Afterwards `df` is reduced to from_geneID / to_geneID / source.
#
# Fixes relative to the previous version:
#   * PLAZA rows are translated even when no Compara table is available
#     (previously the PLAZA result was computed and then discarded);
#   * an unsupported flag2 no longer crashes on subsetting a NULL df_compara;
#   * the compara/plaza "explode" logic lives in a single helper.
if (flag1 != 4 && flag2 != 4) {

  fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
  fl = list.files(fp)

  # Read the OrthoFinder table whose file name contains `tag`; when there is no
  # such file, return an empty 4-column placeholder so the code below can run.
  read_orthofinder = function(tag) {
    fn = fl[grep(tag, fl)] # change filename
    if (length(fn) != 0) {
      data.table::fread(file.path(fp, fn))
    } else {
      data.frame(matrix(ncol = 4, nrow = 0))
    }
  }

  # Expand every row of `tbl` (comma-separated IDs in `source` and `Orthologs`)
  # into all pairwise combinations; returns columns OrthoDB_ID and Ortholog.
  # Note: adds two list columns to `tbl` by reference, as the inline code did.
  expand_pairs = function(tbl) {
    tbl[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    tbl[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    pairs = tbl[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      p = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(p, c("OrthoDB_ID", "Ortholog"))
      p
    }, by = .(row_id = seq_len(nrow(tbl)))]
    pairs[, row_id := NULL]
    pairs
  }

  compara = read_orthofinder('Compara_')
  plaza = read_orthofinder('PLAZA_')

  compara = compara[compara$Species == ref_genome, ] # change name
  plaza = plaza[plaza$Species == ref_genome, ] # change name

  colnames(compara)[3] = colnames(plaza)[3] = 'source'

  if (nrow(compara) != 0) {
    compara = expand_pairs(compara)
    # compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1,
                             sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
    compara = compara[!duplicated(compara), ]
  }

  if (nrow(plaza) != 0) {
    plaza = expand_pairs(plaza)
    # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
    plaza = plaza[!duplicated(plaza), ]
  }

  if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog) # improve if possible

  # ensembl-compara: join on gene ID (flag2 == 1) or protein ID (flag2 == 2).
  # Any other flag2 leaves df_compara NULL, i.e. those rows are dropped.
  df_compara = NULL
  if (nrow(compara) != 0) {
    if (flag2 == 1) { # geneID and prot ID are completely different # make flags
      df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog)
    } else if (flag2 == 2) {
      df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog)
    }
    if (!is.null(df_compara)) {
      # rows without a translation are the "losses" between versions
      df_compara = df_compara[!is.na(df_compara$to_geneID), ]
    }
  }

  # PLAZA: always joined on gene ID.
  df_plaza = NULL
  if (nrow(plaza) != 0) {
    df_plaza = dplyr::filter(df, source == "PLAZA") %>%
      dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
      dplyr::mutate(to_geneID = Ortholog) %>%
      dplyr::select(-Ortholog)
    df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
  }

  # Sources whose rows are replaced by their translated counterparts.
  translated = c(if (nrow(compara) != 0) "ensembl-compara",
                 if (nrow(plaza) != 0) "PLAZA")

  if (length(translated) != 0) {
    df_other = dplyr::filter(df, !(source %in% translated))
    dt = dplyr::bind_rows(df_compara, df_plaza, df_other) # NULL inputs are ignored
  } else {
    dt = df
  }

  ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
  df = dt[, ..ind]
  df = df[!duplicated(df), ]

  # Bring the translated IDs to the common gene-ID format.
  ind = which(df$source %in% translated)
  if (length(ind) > 0) {
    df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
  }

  # Preview: first two and last two rows of every source.
  df %>%
    dplyr::group_by(source) %>%
    dplyr::slice_head(n = 2) %>%
    dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
    dplyr::arrange(source) %>%
    dplyr::ungroup() -> first_last_three_per_source

  print(first_last_three_per_source, n = nrow(first_last_three_per_source))

} else {
  # No translation requested: only keep the three columns used downstream.
  ind = c(grep("from_geneID|to_geneID|source", colnames(df)))
  df = df[, ..ind]

}
## # A tibble: 24 × 3
##    from_geneID to_geneID      source         
##    <chr>       <chr>          <chr>          
##  1 AT1G12040   Prupe.1G000500 FastOMA        
##  2 AT1G62440   Prupe.1G000500 FastOMA        
##  3 AT1G61010   Prupe.I006100  FastOMA        
##  4 AT1G61010   Prupe.I006200  FastOMA        
##  5 AT5G58130   Prupe.1G000700 MCScanX        
##  6 AT5G58110   Prupe.1G000800 MCScanX        
##  7 AT3G62540   Prupe.8G271400 MCScanX        
##  8 AT4G02060   Prupe.8G272000 MCScanX        
##  9 AT3G17900   Prupe.1G267800 OrthoDB        
## 10 AT4G35230   Prupe.1G355500 OrthoDB        
## 11 AT2G07675   Prupe.6G146800 OrthoDB        
## 12 ATMG00980   Prupe.6G146800 OrthoDB        
## 13 AT1G01020   Prupe.2G201100 PLAZA          
## 14 AT1G01050   Prupe.2G200700 PLAZA          
## 15 AT1G52360   Prupe.I003200  PLAZA          
## 16 AT5G40850   Prupe.I005100  PLAZA          
## 17 AT1G01030   Prupe.5G134900 RBH            
## 18 AT1G01040   Prupe.2G200900 RBH            
## 19 ATMG01250   Prupe.6G123900 RBH            
## 20 ATMG01250   Prupe.7G164000 RBH            
## 21 AT1G01020   Prupe.2G201100 ensembl-compara
## 22 AT1G01040   Prupe.2G200900 ensembl-compara
## 23 AT5G67620   Prupe.6G219300 ensembl-compara
## 24 AT5G67630   Prupe.1G544700 ensembl-compara
# Drop exact duplicate rows, then clear the workspace except for the objects
# needed by the following sections.
df = df[!duplicated(df), ]
keep_objects = c(
  "df",
  "ath.gmm", "gn", "sn", "pss_long",
  "plantName1",
  "plantNameOut",
  "plantDirOut",
  "pattern_in",
  "pattern_out",
  "mercator",
  "mercatorPatternIn1",
  "mercatorPatternOut1",
  "mercatorPatternIn2",
  "mercatorPatternOut2",
  "flag1", "flag2"
)
# keep_objects is not in its own list, so it is removed as well
rm(list = setdiff(ls(), keep_objects))




gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4673315 249.6    9385596  501.3  11731995  626.6
## Vcells 120390134 918.6  193111802 1473.4 193111802 1473.4
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

5.4 To wide format

# Reshape the long (from_geneID, to_geneID, source) table into one row per gene
# pair with one logical column per source.
# NOTE(review): `dt = df` does not copy a data.table, so the `:=` below
# presumably also adds `present` to `df` — confirm this is acceptable.
dt = df
length(unique(dt$from_geneID))
## [1] 23136
length(unique(dt$to_geneID))
## [1] 21243
table(dt$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB           PLAZA 
##           16193           44006           17894           38370           20774 
##             RBH 
##           24564
# marker column that dcast() spreads across the source columns
dt[, present := TRUE]

# pairs not reported by a source get FALSE in that source's column
dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

# order rows by source gene, then target gene
dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

5.5 Upset plot

# Source columns that count as evidence; the set depends on flag1.
source_cols = if (flag1 == 1) {
  c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {
  c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


# Number of sources supporting each gene pair.
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))

# ComplexUpset is fed a plain data.frame
dff = as.data.frame(dt.wide)

# Intersections are sorted by degree (number of sets) and then by size,
# both descending.
upset_annotations = list(
  'Intersection size' = intersection_size(counts = FALSE) #,
  # 'Intersection ratio' = intersection_ratio()
)
upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = upset_annotations,
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending"
)
upset_plot = upset_plot +
  ggtitle("Overlap of gene pairs supported by multiple methods")

# Show the plot and write it to the report folder
print(upset_plot)

upset_file = paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf") # change name
ggsave(upset_file, plot = upset_plot, width = 24, height = 6, device = "pdf")

5.6 Ath ORFs

  • Take care: the ath CDS FASTA (used for MCScanX) contains, besides transcripts such as AT1G30330.1, AT1G30330.2 and AT1G30330.3, also uORF entries, e.g.:
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
dt.wide[grep('ORF', dt.wide$from_geneID), ]
## Key: <from_geneID, to_geneID>
##         from_geneID      to_geneID FastOMA MCScanX OrthoDB  PLAZA    RBH
##              <char>         <char>  <lgcl>  <lgcl>  <lgcl> <lgcl> <lgcl>
##  1: AT1G23150.uORF1 Prupe.1G329400   FALSE    TRUE   FALSE  FALSE  FALSE
##  2: AT1G25470.uORF1 Prupe.1G310000   FALSE    TRUE   FALSE  FALSE  FALSE
##  3: AT1G29950.uORF2 Prupe.4G077000   FALSE    TRUE   FALSE  FALSE  FALSE
##  4: AT1G68550.uORF1 Prupe.1G310000   FALSE    TRUE   FALSE  FALSE  FALSE
##  5: AT1G70780.uORF1 Prupe.1G329400   FALSE    TRUE   FALSE  FALSE  FALSE
##  6: AT1G75390.uORF1 Prupe.1G374500   FALSE    TRUE   FALSE  FALSE  FALSE
##  7: AT2G18160.uORF1 Prupe.7G160500   FALSE    TRUE   FALSE  FALSE  FALSE
##  8: AT2G27230.uORF1 Prupe.6G144400   FALSE    TRUE   FALSE  FALSE  FALSE
##  9: AT3G02470.uORF1 Prupe.3G243800   FALSE    TRUE   FALSE  FALSE  FALSE
## 10: AT3G12010.uORF1 Prupe.7G082600   FALSE    TRUE   FALSE  FALSE  FALSE
## 11: AT3G25570.uORF1 Prupe.1G299600   FALSE    TRUE   FALSE  FALSE  FALSE
## 12: AT4G19110.uORF1 Prupe.5G021200   FALSE    TRUE   FALSE  FALSE  FALSE
## 13: AT4G25670.uORF1 Prupe.2G300500   FALSE    TRUE   FALSE  FALSE  FALSE
## 14: AT4G25690.uORF1 Prupe.2G300500   FALSE    TRUE   FALSE  FALSE  FALSE
## 15: AT4G34590.uORF1 Prupe.8G091700   FALSE    TRUE   FALSE  FALSE  FALSE
## 16: AT4G36990.uORF1 Prupe.7G133500   FALSE    TRUE   FALSE  FALSE  FALSE
## 17: AT5G09460.uORF1 Prupe.8G067700   FALSE    TRUE   FALSE  FALSE  FALSE
## 18: AT5G15950.uORF1 Prupe.3G243800   FALSE    TRUE   FALSE  FALSE  FALSE
## 19: AT5G45430.uORF1 Prupe.5G021200   FALSE    TRUE   FALSE  FALSE  FALSE
## 20: AT5G50010.uORF2 Prupe.1G527700   FALSE    TRUE   FALSE  FALSE  FALSE
## 21: AT5G52550.uORF1 Prupe.2G300500   FALSE    TRUE   FALSE  FALSE  FALSE
## 22: AT5G53590.uORF1 Prupe.2G317000   FALSE    TRUE   FALSE  FALSE  FALSE
## 23: AT5G64340.uORF1 Prupe.8G067700   FALSE    TRUE   FALSE  FALSE  FALSE
##         from_geneID      to_geneID FastOMA MCScanX OrthoDB  PLAZA    RBH
##     ensembl-compara count_evidence
##              <lgcl>          <num>
##  1:           FALSE              1
##  2:           FALSE              1
##  3:           FALSE              1
##  4:           FALSE              1
##  5:           FALSE              1
##  6:           FALSE              1
##  7:           FALSE              1
##  8:           FALSE              1
##  9:           FALSE              1
## 10:           FALSE              1
## 11:           FALSE              1
## 12:           FALSE              1
## 13:           FALSE              1
## 14:           FALSE              1
## 15:           FALSE              1
## 16:           FALSE              1
## 17:           FALSE              1
## 18:           FALSE              1
## 19:           FALSE              1
## 20:           FALSE              1
## 21:           FALSE              1
## 22:           FALSE              1
## 23:           FALSE              1
##     ensembl-compara count_evidence
# Remove the uORF pseudo-genes listed above from the wide table.
is_orf = grepl('ORF', dt.wide$from_geneID)
dt.wide = dt.wide[!is_orf, ]

5.7 Gene occurrence

# Counting occurrences: number of rows (gene pairs) in which each source gene
# and each target gene takes part.
to_counts = dt.wide[, .(to_count = .N), by = to_geneID]
from_counts = dt.wide[, .(from_count = .N), by = from_geneID]
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

# Column order: IDs and source flags first, then from_count, to_count and
# count_evidence.
wide_names = colnames(dt.wide)
ind = c(
  grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', wide_names),
  grep('from_count', wide_names),
  grep('to_count', wide_names),
  grep('count_evidence', wide_names)
)
##### take care here
dt.wide = dt.wide[, ..ind]

5.8 In/out PSS

# Annotate every gene pair with the ath Mercator bins, gene names, synonyms and
# PSS membership (left joins: all gene pairs are kept).
df = dt.wide %>%
  merge(ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE) %>%
  merge(gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) %>%
  merge(sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) %>%
  merge(pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

# PSS entries with an AT* identifier that have no orthologue row in df,
# annotated the same way and exported for reporting.
nin = pss_long[!(pss_long$id %in% df$from_geneID), ]
nin = nin[grep('^AT', nin$id), ]
nin = nin %>%
  merge(ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE) %>%
  merge(gn, by.x = 'id', by.y = 'V1', all.x = TRUE) %>%
  merge(sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

nin_file = paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx') # change name
openxlsx::write.xlsx(nin, nin_file, asTable = TRUE)

5.9 fruitTrees plant gmm

# Read the Mercator annotation of the plant and collapse it to one row per
# identifier; multiple bins/names/descriptions are joined with " | ".
fp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
# skip rows whose identifier is just a pair of apostrophes
gmm = gmm[gmm$IDENTIFIER != "''", ]

collapse_unique = function(values) paste(unique(values), collapse = " | ")
combined = gmm[, lapply(.SD, collapse_unique),
               by = IDENTIFIER,
               .SDcols = c("BINCODE", "NAME", "DESCRIPTION")]

charToRaw(combined$IDENTIFIER[1])
##  [1] 27 70 72 75 70 65 2e 33 67 30 30 34 31 30 30 2e 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
# Normalise the Mercator identifiers so that they can be matched to to_geneID.
# Step 1: global replacement with the first species-specific pattern; the
# charToRaw() output before/after shows the surrounding apostrophes (0x27) gone.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
charToRaw(combined$IDENTIFIER[1])
##  [1] 70 72 75 70 65 2e 33 67 30 30 34 31 30 30 2e 31
# Step 2: upper-case the first character only.
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2))  # change as needed
# Step 3: second species-specific pattern.
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
# Step 4: protein ID -> gene ID with the same pattern pair used for to_protID.
# NOTE(review): this call uses perl = TRUE while the to_protID conversion does
# not — confirm pattern_in behaves the same under both regex engines.
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
# How many annotated identifiers occur among the orthologue targets.
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
##  5670 21203
# Strip an apostrophe from the annotation columns.
# NOTE(review): sub() removes only the FIRST apostrophe of each string, so the
# later values in " | "-joined strings keep theirs. The ath table (ath.gmm) is
# prepared with the same sub() calls, so both sides are at least treated alike;
# if this is changed to gsub(), change the ath preparation too.
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)

# Prefix columns 2:4 (BINCODE, NAME, DESCRIPTION) with "fruitTrees_".
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "ensembl-compara"
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "athName"         "athSynonims"    
## [17] "all_pathways"    "short_name"
# Left join: attach the plant Mercator annotation to every gene pair by target
# gene ID; pairs without annotation get NA in the fruitTrees_* columns.
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
# number of gene pairs with (FALSE) / without (TRUE) plant annotation
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE  TRUE 
## 71187   231
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
##   [1] "Prupe.I000100" "Prupe.I000200" "Prupe.I000200" "Prupe.I000200"
##   [5] "Prupe.I000200" "Prupe.I000200" "Prupe.I000200" "Prupe.I000300"
##   [9] "Prupe.I000300" "Prupe.I000300" "Prupe.I000400" "Prupe.I000400"
##  [13] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [17] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [21] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [25] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [29] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [33] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [37] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [41] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [45] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [49] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000600"
##  [53] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [57] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [61] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [65] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [69] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [73] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [77] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [81] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [85] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [89] "Prupe.I000600" "Prupe.I000600" "Prupe.I000700" "Prupe.I000700"
##  [93] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
##  [97] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [101] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [105] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [109] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [113] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [117] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [121] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [125] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [129] "Prupe.I000700" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [133] "Prupe.I000800" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [137] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I000900"
## [141] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I001000"
## [145] "Prupe.I001000" "Prupe.I001000" "Prupe.I001000" "Prupe.I001000"
## [149] "Prupe.I001000" "Prupe.I001000" "Prupe.I001100" "Prupe.I001100"
## [153] "Prupe.I001100" "Prupe.I001600" "Prupe.I001700" "Prupe.I001700"
## [157] "Prupe.I001700" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [161] "Prupe.I001800" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [165] "Prupe.I001800" "Prupe.I001800" "Prupe.I001900" "Prupe.I001900"
## [169] "Prupe.I002100" "Prupe.I002100" "Prupe.I002300" "Prupe.I002300"
## [173] "Prupe.I002300" "Prupe.I002400" "Prupe.I002400" "Prupe.I002600"
## [177] "Prupe.I002600" "Prupe.I002800" "Prupe.I002800" "Prupe.I002900"
## [181] "Prupe.I002900" "Prupe.I003000" "Prupe.I003000" "Prupe.I003100"
## [185] "Prupe.I003100" "Prupe.I003100" "Prupe.I003200" "Prupe.I003200"
## [189] "Prupe.I003200" "Prupe.I003300" "Prupe.I003400" "Prupe.I003400"
## [193] "Prupe.I003400" "Prupe.I003400" "Prupe.I003400" "Prupe.I003400"
## [197] "Prupe.I003400" "Prupe.I003800" "Prupe.I003800" "Prupe.I003900"
## [201] "Prupe.I003900" "Prupe.I004000" "Prupe.I004000" "Prupe.I004400"
## [205] "Prupe.I004400" "Prupe.I004400" "Prupe.I004400" "Prupe.I004400"
## [209] "Prupe.I004400" "Prupe.I004500" "Prupe.I004500" "Prupe.I004500"
## [213] "Prupe.I004500" "Prupe.I004500" "Prupe.I004500" "Prupe.I005000"
## [217] "Prupe.I005000" "Prupe.I005100" "Prupe.I005200" "Prupe.I005200"
## [221] "Prupe.I005200" "Prupe.I005200" "Prupe.I005200" "Prupe.I005500"
## [225] "Prupe.I005500" "Prupe.I005500" "Prupe.I005600" "Prupe.I005700"
## [229] "Prupe.I005800" "Prupe.I006100" "Prupe.I006200"
# Restore the column order of df (merge() moved the key column to the front)
# and append the newly merged annotation columns; the result is a data.frame.
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), dt_cols)
df = as.data.frame(dt)[, c(dt_cols, new_cols)]

# Clear the workspace except for the objects needed by the following sections.
keep_objects = c(
  "df",
  "ath.gmm", "gn", "sn", "pss_long",
  "plantName1",
  "plantNameOut",
  "plantDirOut",
  "pattern_in",
  "pattern_out",
  "mercator",
  "mercatorPatternIn1",
  "mercatorPatternOut1",
  "mercatorPatternIn2",
  "mercatorPatternOut2",
  "flag1", "flag2"
)
# keep_objects is not in its own list, so it is removed as well
rm(list = setdiff(ls(), keep_objects))


gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4819536 257.4    9385596  501.3  11731995  626.6
## Vcells 77106368 588.3  193111802 1473.4 193111802 1473.4
library(magrittr)
library(ggplot2)
library(ComplexUpset)

5.10 Translation table

MapMan Mercator matches: first three levels only

# Keep the first occurrence of every distinct row.
df = unique(df)


# Compare two Mercator/MapMan bin-code strings on their first three levels.
#
# Each argument is a single string holding one or more bin codes separated by
# "|" and/or ";" (e.g. "1.1.1.2 | 27.3"), or NA / "" when the gene has no
# annotation. Codes are trimmed, stray quote characters are removed, and every
# code is truncated to its first three dot-separated levels.
#
# Returns one string:
#   "100% match based on <bins>"    - both truncated sets are identical
#   "partial match based on <bins>" - the sets share at least one bin
#   "no match"                      - nothing shared, or either side is
#                                     unannotated (NA / empty)
compare_bin <- function(athMercator, plantXMercator) {
  # Split a bin-code string into its unique codes, truncated to three levels.
  split_tokens = function(code) {
    if (length(code) == 0 || is.na(code) || code == "") return(character(0))
    tokens = unlist(strsplit(as.character(code), "[|;]"))
    # Upstream quote stripping removes only the first apostrophe, so drop any
    # that are left; otherwise "'2.2'" and "2.2" would count as different bins.
    tokens = trimws(gsub("['\"]", "", tokens))
    tokens = tokens[nzchar(tokens)]

    truncated = vapply(
      strsplit(tokens, ".", fixed = TRUE),
      function(levels) paste(levels[seq_len(min(3L, length(levels)))], collapse = "."),
      character(1)
    )
    unique(truncated[nzchar(truncated)])
  }

  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)

  # Two unannotated genes must not be reported as a match: without this guard
  # setequal(character(0), character(0)) is TRUE.
  if (length(bin_set) == 0 || length(v4_set) == 0) {
    return("no match")
  }

  # Identical sets. This also covers the case where the plant string repeats a
  # single ath bin several times, because the sets are de-duplicated.
  if (setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }

  # Partial match if any bins overlap; report the shared bins.
  common_tokens = intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }

  "no match"
}



# Classify every gene pair by comparing the ath and the plant bin codes,
# one row at a time (compare_bin works on single strings).
df_by_row = dplyr::rowwise(df)
df_by_row = dplyr::mutate(
  df_by_row,
  MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE) # change name
)
df = dplyr::ungroup(df_by_row)

5.11 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 23113
length(unique(df$to_geneID))
## [1] 21227
range(df$from_count)
## [1]  1 57
range(df$to_count)
## [1]   1 115
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 264
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 135
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# Working copy for the filter: every gene pair starts as "reject" and no gene
# is covered yet.
dt = as.data.table(df)
data.table::set(dt, j = "filter_criteria", value = "reject")
covered_genes = character(0)


# Methods in the order in which the filter visits them; the set depends on flag1.
methods = if (flag1 == 1) {
  c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

# For one method: number of supported gene pairs in each MapMan match category.
count_matches = function(method) {
  dt[, {
    supported = get(method) == TRUE
    .(
      Method = method,
      Match_Type = match_categories,
      Count = c(
        sum(supported & MapMan4_Match == "no match"),
        sum(supported & stringr::str_detect(MapMan4_Match, "100% match based")),
        sum(supported & stringr::str_detect(MapMan4_Match, "partial match"))
      )
    )
  }]
}

long_dt = data.table::rbindlist(lapply(methods, count_matches), use.names = TRUE)

# Stacking order of the bars
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

# Stacked bars: MapMan match categories per method, before filtering.
before_filter1 = ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
print(before_filter1)

before_filter1_file = paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf")
ggplot2::ggsave(before_filter1_file, plot = before_filter1,
                device = "pdf", width = 6, height = 6, units = "in")


# Evidence count and match category only; the matched bins are cut off so that
# the category ("100% match ", "partial match ", "no match") can be tabulated.
evidence_cols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)
dtsub = dt[, evidence_cols, with = FALSE]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          60256           8696           2466
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       28995     7390           2167
##   2        8716      797            143
##   3        5356      231             64
##   4        5513      140             40
##   5        6882       93             32
##   6        4794       45             20
# Frequency of every (count_evidence, match category) combination.
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

# The categories come from sub('based on.*', '', ...), which leaves a trailing
# blank on "100% match " / "partial match ". Trim before building the factor so
# the levels do not depend on that invisible whitespace (a level that does not
# match exactly would silently become NA).
tab$MapMan4_Match = trimws(as.character(tab$MapMan4_Match))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))

# Stacked bars: match categories per number of supporting methods.
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# ggsave() without `plot` writes the most recent ggplot, i.e. the one above
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




# Methods that are not accepted on their own: a gene pair reported by them needs
# support from a second special method or a MapMan bin match.
# (`&&` because both flags are scalars tested in an `if` condition.)
if (flag1 != 4 && flag2 != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Named counter: number of "<method>_MapMan4" assignments per special method
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

# Greedy assignment of a filter criterion to every gene pair, visiting the
# methods in the order given by `methods`.
#
# For each method the candidates are the rows that are still "reject", are
# supported by the method, and whose genes were not covered by an EARLIER
# method (coverage is updated once per method, so several pairs of the same
# target gene can be accepted by the same method).
#   * regular methods: every candidate is accepted, criterion = method name;
#   * special methods: a candidate is accepted when
#       - all three special methods support it ("OrthoDB_FastOMA_RBH"),
#       - two support it (their sorted names joined with "_"), or
#       - only this one supports it AND MapMan4_Match reports a (full or
#         partial) bin match ("<method>_MapMan4", counted in mapman4_counts).
# Only target genes (to_geneID) are added to covered_genes.
#
# Same result as the former row-by-row version, but vectorised: no per-row
# data.table subsetting and no regrowing of covered_genes for every row.
for (method in methods) {

  candidates = which(
    dt$filter_criteria == "reject" & dt[[method]] == TRUE &
      !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  )
  if (length(candidates) == 0) next

  if (!(method %in% special_methods)) {
    dt[candidates, filter_criteria := method]
    # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
    covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    next
  }

  # Logical matrix (candidate rows x special methods): which special methods
  # support each candidate pair.
  stopifnot(all(special_methods %in% names(dt)))
  support = do.call(cbind, lapply(special_methods, function(m) dt[[m]][candidates] %in% TRUE))
  n_support = rowSums(support)

  new_criteria = rep(NA_character_, length(candidates))
  new_criteria[n_support == 3] = "OrthoDB_FastOMA_RBH"
  for (k in which(n_support == 2)) {
    new_criteria[k] = paste(sort(special_methods[support[k, ]]), collapse = "_")
  }

  # Single-method support counts only together with a MapMan bin match
  # reconsider
  # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
  #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
  mapman_label = paste0(method, "_MapMan4")
  single_hit = n_support == 1 &
    grepl("match based on", dt$MapMan4_Match[candidates], ignore.case = TRUE)
  new_criteria[single_hit] = mapman_label
  mapman4_counts[[mapman_label]] = mapman4_counts[[mapman_label]] + sum(single_hit)

  is_accepted = !is.na(new_criteria)
  if (any(is_accepted)) {
    accepted = candidates[is_accepted]
    data.table::set(dt, i = accepted, j = "filter_criteria", value = new_criteria[is_accepted])
    # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
    covered_genes = unique(c(covered_genes, dt$to_geneID[accepted]))
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            3923            1100            1947
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                5084                1947                1109                 231 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               17871                 415                3923                 516 
##               PLAZA         RBH_MapMan4              reject 
##                4912                1100               34310
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# Continue under the name `df` (plain assignment, no copy()) and export the
# complete table, rejected pairs included, as tab-separated text and as xlsx.
df = dt

data.table::fwrite(
  x = df,
  file = paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
  sep = '\t'
)
openxlsx::write.xlsx(
  x = df,
  file = paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
  asTable = TRUE
)

5.12 Filtered

# Split the table by filter outcome
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Recompute the per-gene pair counts by reference (no merge needed):
# over all pairs in df, and over the retained pairs only in kept.
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





# 2 x 2 panel: from/to counts before (df) and after (kept) filtering.
# The histograms are first computed without plotting so that all four panels
# can share one y-axis limit.
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
# Common title across the whole page
mtext("Before and after filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

# For every method: how many retained pairs it supports, split by the kind of
# MapMan4 match. Yields three rows (one per match type) per method.
long_kept = data.table::rbindlist(
  lapply(methods, function(method) {
    supported = kept[[method]] == TRUE
    match_text = kept$MapMan4_Match
    data.table(
      Method = method,
      Match_Type = c("no match", "100% match based", "partial match"),
      Count = c(
        sum(supported & match_text == "no match"),
        sum(supported & stringr::str_detect(match_text, "100% match based")),
        sum(supported & stringr::str_detect(match_text, "partial match"))
      )
    )
  }),
  use.names = TRUE
)

# Stacking order of the bar segments
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(
    title = "MapMan match types count per method (after filter)",
    x = "Method",
    y = "Count",
    fill = "Match Type"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

# Writes the most recently created plot (ggsave default)
ggplot2::ggsave(
  filename = paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
  device = "pdf",
  width = 6,
  height = 6,
  units = "in"
)


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          34589           1921            598
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1        9932      949            397
##   2        4832      530             62
##   3        3682      179             49
##   4        4769      125             38
##   5        6580       93             32
##   6        4794       45             20
# Contingency table of evidence count vs. shortened MapMan4 match label for the
# retained pairs, in long form (one row per combination, with its Freq).
tab = as.data.table(as.data.frame(table(
  count_evidence = keptsub$count_evidence,
  MapMan4_Match = keptsub$MapMan4_Match
)))

# Fix the stacking order of the bars. Two labels carry a trailing blank: that is
# what is left of the original text once 'based on.*' has been stripped.
tab[, MapMan4_Match := factor(as.character(MapMan4_Match),
                              levels = c('no match', 'partial match ', '100% match '))]

# Stacked bars: one bar per evidence count, split by match label.
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Frequency of count_evidence by MapMan4_Match (after filter)",
    x = "count_evidence",
    y = "Frequency",
    fill = "MapMan4_Match"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Writes the most recently created plot (ggsave default) to the species report folder.
ggplot2::ggsave(
  filename = paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
  device = "pdf",
  width = 6,
  height = 6,
  units = "in"
)


# Method flags, evidence count, match label and filter label of the retained pairs
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# Shorten the match text to 'no match' / 'partial match' / '100% match'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)

# Three-way contingency table in long form; empty combinations are dropped
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab = tab[Freq > 0]

# Display order of facets, x-axis categories and bar segments.
# A filter label that is not listed below becomes NA.
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


# One facet per evidence count; bars per filter label, stacked by match label
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    # border around each facet
    # NOTE(review): `size` is deprecated for line elements in ggplot2 >= 3.4.0;
    # switch to `linewidth = 1` once the installed version is confirmed.
    panel.border = element_rect(color = "black", fill = NA, size = 1)
  )

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Export the rejected pairs to the species report folder
openxlsx::write.xlsx(rejected, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


# Build an undirected graph from the distinct retained gene pairs and label
# every gene with the id of the connected component it belongs to.
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# The graph is undirected, so both genes of a pair share one component and the
# join on from_geneID alone is sufficient. The commented-out lines below are the
# variant that would look up both ends separately (in case of directed graph).
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


# Export the retained pairs (with component id).
# NOTE(review): this file goes to ../output, unlike the other exports, which go
# to the species report folder - confirm this is intended.
openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


# Evidence columns available for this species, selected by flag1
# (1: all six sources, 2: no PLAZA, 3: no PLAZA / ensembl-compara,
#  anything else: MCScanX, RBH and FastOMA only).  # make flags
source_cols = switch(
  as.character(flag1),
  "1" = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA"),
  "2" = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA"),
  "3" = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA"),
  c("MCScanX", 'RBH', "FastOMA")
)





# UpSet diagram of the retained pairs: which combinations of sources support them.
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  data = kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  # Intersections ordered by degree (number of sets involved) first, then by
  # cardinality (intersection size) within each degree, both descending
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending",
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  )
)
upset_plot = upset_plot +
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Show the plot in the report and store it as a wide PDF
print(upset_plot)

ggsave(
  filename = paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
  plot = upset_plot,
  device = "pdf",
  width = 24,
  height = 6
)



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 20588
length(unique(kept$to_geneID))
## [1] 20794
range(kept$from_count)
## [1]  1 50
range(kept$to_count)
## [1]  1 96
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 7
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 15
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

5.13 PSS kept/rejected

# Keep only the id / pathway / short-name columns of the PSS table.
# NOTE(review): selecting columns with a computed index in `j` presumes pss_long
# is a plain data.frame here (a data.table would return the index vector) - confirm.
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]

# Attach orthologue, annotation and filter columns of df to every PSS id.
# The columns are picked from df's own names (previously taken from `dt`).
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(df), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
# Only ids starting with "AT" are kept; exact duplicate rows are removed
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 145                  38                  43                   7 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 728                  27                  51                  16 
##               PLAZA         RBH_MapMan4              reject 
##                 166                  19                1154
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
# Parameters handed to the child document for one species (here: almond).
params_list <- list(
  # Species label as spelled by each orthology source (change per species)
  plantName1 = 'pdul',            # PLAZA, OrthoDB, RBH
  plantName2 = 'prunus_dulcis',   # ensembl-compara
  plantName3 = 'almond',          # MCScanX
  plantName4 = 'pdul',            # FastOMA

  # Folders and output label
  plantDirIn = "pdul_almond",     # OrthoFinder input folder (inconsistent IDs)
  plantNameOut = "almond",
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "almond"),

  # Protein ID -> gene ID: replace everything from the last dot onwards
  pattern_in = "\\.[^.]*$",
  pattern_out = "",               # applied to all IDs

  # Optional ID clean-up patterns for the ensembl-compara and PLAZA tables
  compara_pattern_in1 = "",
  compara_pattern_out1 = "",
  compara_pattern_in2 = "",
  compara_pattern_out2 = "",
  plaza_pattern_in1 = "",
  plaza_pattern_in2 = "",

  # Species name inside the OrthoFinder tables (inconsistent IDs, for OrthoDB)
  ref_genome = "Texas_F1_protein",

  # Plant Mercator (gmm) annotation file and identifier clean-up
  mercator = 'pdul_Mercator4v7_results.txt',
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']",  # generic removal of quote characters
  mercatorPatternOut1 = "",
  mercatorPatternIn2 = "([fg])",
  mercatorPatternOut2 = "\\U\\1",

  # Switches read by the child document
  flag1 = 2,
  flag2 = 2,
  flag3 = FALSE
)

# note: in compara - geneID and prot ID are completely different

# Copy every entry of params_list into a fresh environment as a variable of the
# same name; this environment is passed to knit_child() as `envir`.
env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000273f6492a50>

child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-78] | |….. | 9% | |…… | 12% [unnamed-chunk-79] | |…….. | 15% | |……… | 18% [unnamed-chunk-80] | |……….. | 21% | |………… | 24% [unnamed-chunk-81] | |………….. | 27% | |…………… | 30% [unnamed-chunk-82] | |…………….. | 33% | |………………. | 36% [unnamed-chunk-83] | |……………….. | 39% | |…………………. | 42% [unnamed-chunk-84] | |………………….. | 45% | |……………………. | 48% [unnamed-chunk-85] | |…………………….. | 52% | |………………………. | 55% [unnamed-chunk-86] | |……………………….. | 58% | |…………………………. | 61% [unnamed-chunk-87] | |………………………….. | 64% | |……………………………. | 67% [unnamed-chunk-88] | |……………………………… | 70% | |………………………………. | 73% [unnamed-chunk-89] | |………………………………… | 76% | |…………………………………. | 79% [unnamed-chunk-90] | |…………………………………… | 82% | |……………………………………. | 85% [unnamed-chunk-91] | |……………………………………… | 88% | |………………………………………. | 91% [unnamed-chunk-92] | |………………………………………… | 94% | |…………………………………………. | 97% [unnamed-chunk-93] | |……………………………………………| 100%

cat(child_content)

6 Subsection: pdul

# Create the species report folder (including missing parent folders) if absent
if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

6.1 Ortho sources

# Collect the zipped orthology tables of all supported sources
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees', fl)] # change names
fl = fl[grep('\\.zip$', fl)]

# Common layout every source is brought into
id_cols = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')

# One standardised table per file; bound together once after the loop
parts = vector('list', length(fl))

for (k in seq_along(fl)) {
  
  i = fl[k]
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  # The branch below is chosen from a single source label per file
  if (length(us) != 1L || is.na(us)) {
    stop('Expected exactly one value in column `source` of ', i, call. = FALSE)
  }
  
  if (us == 'ensembl-compara') {
    
    # Gene and protein IDs are both provided
    dt = dt[dt$homology_species == plantName2, ]
    # print(head(dt))
    dt = dt[, c(1,2,6,7,10)]
    colnames(dt) = id_cols
    
  } else if (us == 'FastOMA') {
    
    # Protein-level source: gene ID columns are blanked and derived later
    dt = dt[dt$to_plant == plantName4, ]
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 5)]
    colnames(dt) = id_cols
    dt[, 1] = NA
    dt[, 3] = NA
    
  } else if (us == 'MCScanX') {
    
    # Protein-level source: gene ID columns are blanked and derived later
    # dt = dt[grepl('stu', dt$to_plant), ]
    dt = dt[grepl(plantName3, dt$to_plant), ] #  change names
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 6)]
    colnames(dt) = id_cols
    dt[, 1] = NA
    dt[, 3] = NA
    
  } else if (us == 'PLAZA') {
    
    # Gene-level source: columns 2 and 4 are blanked after renaming
    dt = dt[dt$orthologous_species == plantName1, ]
    # print(head(dt))
    colnames(dt) = id_cols
    dt[, 2] = NA
    dt[, 4] = NA
    
  } else if (us == 'OrthoDB') {
    
    # Gene-level source: columns 2 and 4 are blanked after renaming
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = id_cols
    dt[, 2] = NA
    dt[, 4] = NA
    
  } else if (us == 'RBH') {
    
    # Gene-level source: columns 2 and 4 are blanked after renaming
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = id_cols
    dt[, 2] = NA
    dt[, 4] = NA
    
  } else {
    
    # Unsupported label: report which file it came from and leave it out
    warning('Unknown source "', us, '" in ', i, '; file skipped', call. = FALSE)
    next
    
  }
  
  parts[[k]] = dt
}

# Stack all sources; df stays NULL when nothing was read
parts = Filter(Negate(is.null), parts)
df = if (length(parts) > 0) data.table::rbindlist(parts, use.names = TRUE) else NULL
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
table(df$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB             RBH 
##           16421           42927           36681           35734           24829
# Preview: first two and last two rows of every source, sorted by source.
# (The variable name says "three", but two rows are taken from each end.)
first_last_three_per_source = dplyr::ungroup(
  dplyr::arrange(
    dplyr::bind_rows(
      dplyr::slice_head(dplyr::group_by(df, source), n = 2),
      dplyr::slice_tail(dplyr::group_by(df, source), n = 2)
    ),
    source
  )
)

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_protID to_geneID       to_protID        source         
##    <chr>       <chr>       <chr>           <chr>            <chr>          
##  1 <NA>        AT2G05620.2 <NA>            TexasF1_G1000.1  FastOMA        
##  2 <NA>        AT3G48880.2 <NA>            TexasF1_G10001.1 FastOMA        
##  3 <NA>        AT3G48890.1 <NA>            TexasF1_G9999.1  FastOMA        
##  4 <NA>        AT5G52240.1 <NA>            TexasF1_G9999.1  FastOMA        
##  5 <NA>        AT5G22060.1 <NA>            TexasF1_G100.1   MCScanX        
##  6 <NA>        AT3G48880.1 <NA>            TexasF1_G10000.1 MCScanX        
##  7 <NA>        AT5G52240.1 <NA>            TexasF1_G9999.1  MCScanX        
##  8 <NA>        AT5G52240.2 <NA>            TexasF1_G9999.1  MCScanX        
##  9 AT1G23390   <NA>        TexasF1_G3359   <NA>             OrthoDB        
## 10 AT5G19210   <NA>        TexasF1_G2060   <NA>             OrthoDB        
## 11 AT3G51810   <NA>        TexasF1_G23162  <NA>             OrthoDB        
## 12 AT2G28815   <NA>        TexasF1_G6420   <NA>             OrthoDB        
## 13 AT1G01030   <NA>        TexasF1_G18833  <NA>             RBH            
## 14 AT1G01040   <NA>        TexasF1_G9057   <NA>             RBH            
## 15 ATMG00860   <NA>        TexasF1_G25095  <NA>             RBH            
## 16 ATMG01250   <NA>        TexasF1_G22012  <NA>             RBH            
## 17 AT1G01020   AT1G01020.1 Prudul26B025674 VVA35962         ensembl-compara
## 18 AT1G01040   AT1G01040.2 Prudul26B022109 VVA35960         ensembl-compara
## 19 AT5G67620   AT5G67620.1 Prudul26B020268 VVA23378         ensembl-compara
## 20 AT5G67630   AT5G67630.1 Prudul26B028327 VVA20286         ensembl-compara

6.2 Transcript (aka protein) to geneID

# Rows without a source-side gene ID: derive it from the protein ID by removing
# a trailing ".<digits>" suffix.
ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])

# orfs!
# Gene IDs that still contain a dot were not shortened by the pattern above
# (their suffix is not purely numeric); tabulate which sources they come from.
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)
## 
## MCScanX 
##       5
print(df[ind, ])
##        from_geneID     from_protID to_geneID        to_protID  source
##             <char>          <char>    <char>           <char>  <char>
## 1: AT4G19110.uORF1 AT4G19110.uORF1      <NA> TexasF1_G17548.1 MCScanX
## 2: AT5G45430.uORF1 AT5G45430.uORF1      <NA> TexasF1_G17548.1 MCScanX
## 3: AT1G48600.uORF1 AT1G48600.uORF1      <NA> TexasF1_G19862.1 MCScanX
## 4: AT1G06150.uORF1 AT1G06150.uORF1      <NA> TexasF1_G29714.1 MCScanX
## 5: AT2G31280.uORF1 AT2G31280.uORF1      <NA> TexasF1_G29714.1 MCScanX
# Rows without a target-side gene ID: derive it from the protein ID using the
# species-specific pattern_in / pattern_out replacement.
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed



# Preview: first two and last two rows of every source, sorted by source.
# (The variable name says "three", but two rows are taken from each end.)
first_last_three_per_source = dplyr::ungroup(
  dplyr::arrange(
    dplyr::bind_rows(
      dplyr::slice_head(dplyr::group_by(df, source), n = 2),
      dplyr::slice_tail(dplyr::group_by(df, source), n = 2)
    ),
    source
  )
)

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_protID to_geneID       to_protID        source         
##    <chr>       <chr>       <chr>           <chr>            <chr>          
##  1 AT2G05620   AT2G05620.2 TexasF1_G1000   TexasF1_G1000.1  FastOMA        
##  2 AT3G48880   AT3G48880.2 TexasF1_G10001  TexasF1_G10001.1 FastOMA        
##  3 AT3G48890   AT3G48890.1 TexasF1_G9999   TexasF1_G9999.1  FastOMA        
##  4 AT5G52240   AT5G52240.1 TexasF1_G9999   TexasF1_G9999.1  FastOMA        
##  5 AT5G22060   AT5G22060.1 TexasF1_G100    TexasF1_G100.1   MCScanX        
##  6 AT3G48880   AT3G48880.1 TexasF1_G10000  TexasF1_G10000.1 MCScanX        
##  7 AT5G52240   AT5G52240.1 TexasF1_G9999   TexasF1_G9999.1  MCScanX        
##  8 AT5G52240   AT5G52240.2 TexasF1_G9999   TexasF1_G9999.1  MCScanX        
##  9 AT1G23390   <NA>        TexasF1_G3359   <NA>             OrthoDB        
## 10 AT5G19210   <NA>        TexasF1_G2060   <NA>             OrthoDB        
## 11 AT3G51810   <NA>        TexasF1_G23162  <NA>             OrthoDB        
## 12 AT2G28815   <NA>        TexasF1_G6420   <NA>             OrthoDB        
## 13 AT1G01030   <NA>        TexasF1_G18833  <NA>             RBH            
## 14 AT1G01040   <NA>        TexasF1_G9057   <NA>             RBH            
## 15 ATMG00860   <NA>        TexasF1_G25095  <NA>             RBH            
## 16 ATMG01250   <NA>        TexasF1_G22012  <NA>             RBH            
## 17 AT1G01020   AT1G01020.1 Prudul26B025674 VVA35962         ensembl-compara
## 18 AT1G01040   AT1G01040.2 Prudul26B022109 VVA35960         ensembl-compara
## 19 AT5G67620   AT5G67620.1 Prudul26B020268 VVA23378         ensembl-compara
## 20 AT5G67630   AT5G67630.1 Prudul26B028327 VVA20286         ensembl-compara
# Per source: number of rows whose target gene ID / target protein ID is missing
summary_na = df[, .(
  na_to_geneID = sum(is.na(to_geneID)),
  na_to_protID = sum(is.na(to_protID))
), by = source]
print(summary_na)
##             source na_to_geneID na_to_protID
##             <char>        <int>        <int>
## 1: ensembl-compara            0            0
## 2:         FastOMA            0            0
## 3:         MCScanX            0            0
## 4:         OrthoDB            0        35734
## 5:             RBH            0        24829

6.3 PLAZA and ensembl-compara with Orthofinder

Here we have some losses, because genes do not translate well between versions!

# Translate the target IDs of the ensembl-compara and PLAZA pairs via the
# OrthoFinder tables, then reduce df to from_geneID / to_geneID / source.
# Skipped when either flag is 4 (then only the column reduction is done).
if (flag1 != 4 && flag2 != 4) {

  fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
  
  fl = list.files(fp)
  fn = fl[grep('Compara_', fl)] # change filename
  # Exactly one table can be read; fail with a clear message otherwise
  if (length(fn) > 1) stop("More than one 'Compara_' file found in ", fp, call. = FALSE)
  if (length(fn) != 0) {
    compara = data.table::fread(file.path(fp, fn))
  } else {
    # No table available: empty placeholder so the nrow() checks below skip it
    compara = data.frame(matrix(ncol = 4, nrow = 0))
  }
  
  fn = fl[grep('PLAZA_', fl)] # change filename
  if (length(fn) > 1) stop("More than one 'PLAZA_' file found in ", fp, call. = FALSE)
  if (length(fn) != 0) {
    plaza = data.table::fread(file.path(fp, fn))
  } else {
    plaza = data.frame(matrix(ncol = 4, nrow = 0))
  }
  
  
  # Keep only the rows of the reference genome
  compara = compara[compara$Species == ref_genome, ] # change name
  plaza = plaza[plaza$Species == ref_genome, ] # change name
  
  
  # The third column holds the IDs to translate from; give it a fixed name
  colnames(compara)[3] = colnames(plaza)[3] = 'source'
  
  if (nrow(compara) != 0) {
    # Both ID columns are comma-separated lists: expand every row into all
    # (source ID, orthologue) combinations
    compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    result = compara[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(pairs, c("OrthoDB_ID", "Ortholog"))
      pairs
    }, by = seq_len(nrow(compara))]
    compara = result[, seq_len := NULL]
    # compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1, 
                             sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
    compara = compara[!duplicated(compara), ]
    head(compara)
  }
  
  
  if (nrow(plaza) != 0) {
    # Same expansion for the PLAZA table
    plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    result = plaza[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(pairs, c("OrthoDB_ID", "Ortholog"))
      pairs
    }, by = seq_len(nrow(plaza))]
    plaza = result[, seq_len := NULL]
    # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
    plaza = plaza[!duplicated(plaza), ]
    head(plaza)  
  }
  
  # Optionally keep only the text after the last blank of each orthologue ID
  if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog) # improve if possible
  
  if (nrow(compara) != 0) {
    # flag2 selects the join key: 1 = target gene ID, 2 = target protein ID.
    # Pairs without a translation are dropped.
    if (flag2 == 1) { # geneID and prot ID are completely different # make flags
      df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog)  
    } else if (flag2 == 2) {
        df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog) 
    } else {
      # NOTE(review): with any other flag2 value the ensembl-compara pairs are
      # left out of the result entirely - confirm this is intended.
      df_compara = NULL
    }
    df_compara = df_compara[!is.na(df_compara$to_geneID), ]
  }
  
  
  
  if (nrow(plaza) != 0) {
    df_plaza = dplyr::filter(df, source == "PLAZA") %>%
      dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
      dplyr::mutate(to_geneID = Ortholog) %>%
      dplyr::select(-Ortholog)
    df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
  }
  
  # Re-assemble: translated pairs plus the untouched pairs of the other sources.
  # NOTE(review): when only the PLAZA table exists (no compara table), df is
  # passed on unchanged and df_plaza is not used - confirm this is intended.
  if (nrow(compara) != 0) {
    if (nrow(plaza) != 0) {
      df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))  
      dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
    } else {
      df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
      dt = dplyr::bind_rows(df_compara, df_other)
    }
  } else {
    dt = df
  }
  
  
  # Keep the gene-level columns only and drop duplicated pairs
  ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
  df = dt[, ..ind]
  df = df[!duplicated(df), ]
  
  
  # The translated IDs get the same pattern_in / pattern_out replacement that
  # is used elsewhere to turn protein IDs into gene IDs
  if (nrow(compara) != 0) {
    if (nrow(plaza) != 0) {
      ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
      df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
    } else {
      ind = which(df$source %in% c('ensembl-compara'))
      df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
    }
  }
  
  
  
  
  
  # Preview: first two and last two rows of every source
  df %>%
    dplyr::group_by(source) %>%
    dplyr::slice_head(n = 2) %>%
    dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
    dplyr::arrange(source) %>%
    dplyr::ungroup() -> first_last_three_per_source
  
  print(first_last_three_per_source, n = nrow(first_last_three_per_source))

} else {
  # No translation: only reduce to the gene-level columns
  ind = c(grep("from_geneID|to_geneID|source", colnames(df)))
  df = df[, ..ind]
  
}
## # A tibble: 20 × 3
##    from_geneID to_geneID      source         
##    <chr>       <chr>          <chr>          
##  1 AT2G05620   TexasF1_G1000  FastOMA        
##  2 AT3G48880   TexasF1_G10001 FastOMA        
##  3 AT3G48890   TexasF1_G9999  FastOMA        
##  4 AT5G52240   TexasF1_G9999  FastOMA        
##  5 AT5G22060   TexasF1_G100   MCScanX        
##  6 AT3G48880   TexasF1_G10000 MCScanX        
##  7 AT3G48890   TexasF1_G9999  MCScanX        
##  8 AT5G52240   TexasF1_G9999  MCScanX        
##  9 AT1G23390   TexasF1_G3359  OrthoDB        
## 10 AT5G19210   TexasF1_G2060  OrthoDB        
## 11 AT3G51810   TexasF1_G23162 OrthoDB        
## 12 AT2G28815   TexasF1_G6420  OrthoDB        
## 13 AT1G01030   TexasF1_G18833 RBH            
## 14 AT1G01040   TexasF1_G9057  RBH            
## 15 ATMG00860   TexasF1_G25095 RBH            
## 16 ATMG01250   TexasF1_G22012 RBH            
## 17 AT1G01020   TexasF1_G9059  ensembl-compara
## 18 AT1G01040   TexasF1_G9057  ensembl-compara
## 19 AT5G67630   TexasF1_G6308  ensembl-compara
## 20 AT5G67630   TexasF1_G6731  ensembl-compara
# Drop exact duplicate rows
df = df[!duplicated(df), ]
# Free memory: remove every object returned by ls() except the pair table, the
# ath annotation tables and the parameters named below.
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))




gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3604195 192.5    7508477  401.0  11731995  626.6
## Vcells 81384363 621.0  154489442 1178.7 193111802 1473.4
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

6.4 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22456
length(unique(dt$to_geneID))
## [1] 20903
table(dt$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB             RBH 
##           15975           42927           20151           35734           24829
# Marker column used as the cell value when spreading the sources.
# `:=` adds it by reference; dt was assigned from df without copy(), so df
# gains the column as well.
dt[, present := TRUE]

# One row per gene pair, one logical column per source (FALSE where the source
# does not report the pair)
dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

# Sort by source-side gene, then target-side gene
dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

6.5 Upset plot

# Evidence columns available for this species, selected by flag1
# (1: all six sources, 2: no PLAZA, 3: no PLAZA / ensembl-compara,
#  anything else: MCScanX, RBH and FastOMA only).
source_cols = switch(
  as.character(flag1),
  "1" = c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA"),
  "2" = c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA"),
  "3" = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA"),
  c("MCScanX", 'RBH', "FastOMA")
)


# Number of sources supporting each gene pair
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))

# Plain data.frame copy handed to the UpSet plot
dff = as.data.frame(dt.wide)

# UpSet diagram of all pairs: which combinations of sources support them
upset_plot = upset(
  data = dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  # Intersections ordered by degree (number of sets involved) first, then by
  # cardinality (intersection size) within each degree, both descending
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending",
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  )
)
upset_plot = upset_plot +
  ggtitle("Overlap of gene pairs supported by multiple methods")

# Show the plot in the report and store it as a wide PDF
print(upset_plot)

ggsave(
  filename = paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"),
  plot = upset_plot,
  device = "pdf",
  width = 24,
  height = 6
) # change name

6.6 Ath ORFs

  • Take care: the ath CDS FASTA (used for MCScanX) also contains uORF entries, e.g. besides AT1G30330.1, AT1G30330.2, AT1G30330.3:
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
dt.wide[grep('ORF', dt.wide$from_geneID), ]
## Key: <from_geneID, to_geneID>
##        from_geneID      to_geneID FastOMA MCScanX OrthoDB    RBH
##             <char>         <char>  <lgcl>  <lgcl>  <lgcl> <lgcl>
## 1: AT1G06150.uORF1 TexasF1_G29714   FALSE    TRUE   FALSE  FALSE
## 2: AT1G48600.uORF1 TexasF1_G19862   FALSE    TRUE   FALSE  FALSE
## 3: AT2G31280.uORF1 TexasF1_G29714   FALSE    TRUE   FALSE  FALSE
## 4: AT4G19110.uORF1 TexasF1_G17548   FALSE    TRUE   FALSE  FALSE
## 5: AT5G45430.uORF1 TexasF1_G17548   FALSE    TRUE   FALSE  FALSE
##    ensembl-compara count_evidence
##             <lgcl>          <num>
## 1:           FALSE              1
## 2:           FALSE              1
## 3:           FALSE              1
## 4:           FALSE              1
## 5:           FALSE              1
dt.wide = dt.wide[grep('ORF', dt.wide$from_geneID, invert = TRUE), ]

6.7 Gene occurrence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

6.8 In/out PSS

# Annotate every pair on its ath gene ID: Mercator bins first, then the gene
# name and synonym tables (both keyed by V1), finally the PSS columns.
df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = Reduce(
  function(acc, annot) merge(acc, annot, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE),
  list(gn, sn),
  df
)
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

# PSS entries with an "AT..." id that have no orthologue pair at all, given the
# same Mercator / name / synonym annotation
nin = pss_long[!(pss_long$id %in% df$from_geneID), ]
nin = nin[grepl('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = Reduce(
  function(acc, annot) merge(acc, annot, by.x = 'id', by.y = 'V1', all.x = TRUE),
  list(gn, sn),
  nin
)

openxlsx::write.xlsx(
  x = nin,
  file = paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
  asTable = TRUE
) # change name

6.9 fruitTrees plant gmm

# Read the Mercator annotation of the plant and drop rows whose identifier is
# just a pair of quotes
fp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[IDENTIFIER != "''"]

# One row per identifier: the distinct bin codes, names and descriptions are
# each collapsed into a single " | "-separated string
combined = gmm[,
  lapply(.SD, function(v) paste(unique(v), collapse = " | ")),
  by = IDENTIFIER,
  .SDcols = c("BINCODE", "NAME", "DESCRIPTION")
]

# Raw bytes of the first identifier, to inspect which quote characters it carries
charToRaw(combined$IDENTIFIER[1])
##  [1] 27 74 65 78 61 73 66 31 5f 67 31 30 34 30 31 2e 31 27
# Alternative identifier clean-ups, kept for reference (change as needed):
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# The raw bytes printed above start and end with 27 ('), i.e. the quote occurs twice per identifier.
# sub() removes only one occurrence per call, so gsub() is used to remove every match of mercatorPatternIn1.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
# Print the raw bytes of the first identifier again to check the result of the substitution.
charToRaw(combined$IDENTIFIER[1])
##  [1] 74 65 78 61 73 66 31 5f 67 31 30 34 30 31 2e 31
# Bring the Mercator identifiers into the form used by to_geneID.
ids = combined$IDENTIFIER
# 1) upper-case the first character only
ids = paste0(toupper(substring(ids, 1, 1)), substring(ids, 2))  # change as needed
# 2) apply the second Mercator-specific substitution everywhere it matches
ids = gsub(mercatorPatternIn2, mercatorPatternOut2, ids, perl=TRUE) # change as needed;
# 3) apply the same pattern_in/pattern_out substitution that is used on to_protID
combined$IDENTIFIER = sub(pattern_in, pattern_out, ids, perl=TRUE)
# How many Mercator identifiers occur among the orthologue targets?
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
##  8713 20903
# Remove the first single quote from each annotation column.
# NOTE(review): sub() touches only the first quote per string; any further quotes stay in place.
for (annot_col in c("BINCODE", "NAME", "DESCRIPTION")) {
  data.table::set(combined, j = annot_col, value = sub("\\'", '', combined[[annot_col]]))
}

# Prefix the three annotation columns so they can sit next to the ath_* columns.
setnames(combined, 2:4, paste0('fruitTrees_', colnames(combined)[2:4]))

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "ensembl-compara" "from_count"     
##  [9] "to_count"        "count_evidence"  "ath_BINCODE"     "ath_NAME"       
## [13] "ath_DESCRIPTION" "athName"         "athSynonims"     "all_pathways"   
## [17] "short_name"
# Left-join the plant-side Mercator annotation onto the pair table by target gene.
dt = merge(x = df, y = combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE)
# Any TRUE here would be a target gene without a Mercator record.
table(is.na(dt[['fruitTrees_BINCODE']]))
## 
## FALSE 
## 67028
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
# Restore the column order of df and append the columns added by the last merge.
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), dt_cols)
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]

# Clear the workspace except for the result table, the shared annotation tables
# and the run parameters.
keep_objects = c("df",
                 "ath.gmm", "gn", "sn", "pss_long",
                 "plantName1",
                 "plantNameOut",
                 "plantDirOut",
                 "pattern_in",
                 "pattern_out",
                 "mercator",
                 "mercatorPatternIn1",
                 "mercatorPatternOut1",
                 "mercatorPatternIn2",
                 "mercatorPatternOut2",
                 "flag1", "flag2")
rm(list = setdiff(ls(), keep_objects))


gc()
##            used  (Mb) gc trigger (Mb)  max used   (Mb)
## Ncells  3140283 167.8    7508477  401  11731995  626.6
## Vcells 44056723 336.2  123591554  943 193111802 1473.4
library(magrittr)
library(ggplot2)
library(ComplexUpset)

6.10 Translation table

MapMan Mercator matches: first three levels only

# Keep the first copy of every fully identical row.
df = unique(df)


# Compare the MapMan/Mercator bins of an Arabidopsis gene with those of its
# putative orthologue, using only the first three dot-separated bin levels.
#
# Arguments:
#   athMercator    - one string of bin codes for the Arabidopsis gene; codes may be
#                    separated by "|" and/or ";". NA or "" means "no bins".
#   plantXMercator - the same for the other plant's gene.
#
# Returns a single string:
#   "100% match based on <bins>"    - both genes have the same set of truncated bins
#   "partial match based on <bins>" - the sets differ but share at least one bin
#   "no match"                      - nothing is shared, or either gene has no bins
compare_bin <- function(athMercator, plantXMercator) {
  # Turn one annotation string into its unique bins truncated to three levels.
  split_tokens = function(code) {
    if (length(code) != 1 || is.na(code) || code == "") return(character(0))
    tokens = unlist(strsplit(code, "[|;]"), use.names = FALSE)
    # Single quotes may survive upstream clean-up on some tokens only (sub() removes
    # just the first one); strip them all so "1.1.1" and "'1.1.1" compare equal.
    tokens = trimws(gsub("'", "", tokens, fixed = TRUE))
    tokens = tokens[nzchar(tokens)]
    truncated = vapply(
      strsplit(tokens, ".", fixed = TRUE),
      function(lv) paste(utils::head(lv, 3), collapse = "."),
      character(1)
    )
    unique(truncated)
  }

  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)

  # Two genes without any bin are not evidence of a functional match.
  if (length(bin_set) == 0 || length(v4_set) == 0) {
    return("no match")
  }

  # Identical sets. This also covers the case where several plant bins all
  # collapse onto the single Arabidopsis bin after truncation.
  if (setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }

  # Partial match if any tokens overlap, mention those tokens
  common_tokens = intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }

  "no match"
}



# Score every pair: compare_bin() works on single strings, so it is evaluated row by row.
df = dplyr::ungroup(
  dplyr::mutate(
    dplyr::rowwise(df),
    MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE) # change name
  )
)

6.11 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 22451
length(unique(df$to_geneID))
## [1] 20902
range(df$from_count)
## [1]   1 150
range(df$to_count)
## [1]   1 113
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 171
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 120
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# Every pair starts as rejected; no target gene is covered yet.
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


# Evidence sources available for this plant, in the order they are applied; flag1 selects the set.
methods = if (flag1 == 1) {
  c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

# For one method: number of pairs it supports, split by MapMan match category.
tally_matches = function(method) {
  dt[, {
    supported = get(method) == TRUE
    list(
      Method = method,
      Match_Type = c("no match", "100% match based", "partial match"),
      Count = c(
        sum(supported & MapMan4_Match == "no match"),
        sum(supported & stringr::str_detect(MapMan4_Match, "100% match based")),
        sum(supported & stringr::str_detect(MapMan4_Match, "partial match"))
      )
    )
  }]
}

long_dt = data.table::rbindlist(lapply(methods, tally_matches), use.names = TRUE)

# Fix the stacking order of the bars.
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(
    title = "MapMan match types count per method",
    x = "Method",
    y = "Count",
    fill = "Match Type"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)


# Keep only the evidence count and the match label, and cut the label before "based on".
dtsub = dt[, grep("count_evidence|MapMan4_Match", names(dt), value = TRUE), with = FALSE]
data.table::set(dtsub, j = "MapMan4_Match", value = sub('based on.*', '', dtsub$MapMan4_Match))
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          54247          10072           2709
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       26635     8113           2222
##   2        8151     1113            217
##   3        5577      378             94
##   4        6765      254             89
##   5        7119      214             87
# Cross-tabulate evidence count against match label in long form for plotting.
tab = setnames(
  as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match))),
  c("count_evidence", "MapMan4_Match", "Freq")
)

# The labels end in a space because they were cut right before "based on".
tab[, MapMan4_Match := factor(as.character(MapMan4_Match),
                              levels = c('no match', 'partial match ', '100% match '))]

ggplot2::ggplot(tab, ggplot2::aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(
    title = "Frequency of count_evidence by MapMan4_Match",
    x = "count_evidence",
    y = "Frequency",
    fill = "MapMan4_Match"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)




# "Special" methods are not accepted on their own: a hit is kept only when at least
# two special methods agree, or when a single one is backed by a MapMan bin match.
special_methods = if (flag1 != 4 && flag2 != 4) {
  c("OrthoDB", "RBH", "FastOMA")
} else {
  c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

# Methods are processed in the order given by `methods`. Once a method accepts rows,
# their target genes are added to covered_genes and are no longer candidates for
# the methods that follow.
for (method in methods) {

  # Rows that are still rejected, are supported by this method, and whose genes are
  # not covered yet. The set is fixed before any row of this method is accepted.
  candidates = which(dt$filter_criteria == "reject" & dt[[method]] == TRUE &
                       !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes))

  if (length(candidates) == 0) next

  if (method %in% special_methods) {
    n_cand = length(candidates)

    # Per candidate: how many special methods support it, and the "_"-joined,
    # alphabetically sorted names of those methods. NA flags count as no support.
    n_support = integer(n_cand)
    support_label = character(n_cand)
    for (m in sort(special_methods)) {
      hit = dt[[m]][candidates] %in% TRUE
      n_support = n_support + hit
      support_label[hit] = ifelse(nzchar(support_label[hit]),
                                  paste(support_label[hit], m, sep = "_"),
                                  m)
    }

    # A lone special method needs a MapMan match (100% or partial) as corroboration.
    # reconsider
    # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
    #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
    has_mapman = grepl("match based on", dt$MapMan4_Match[candidates], ignore.case = TRUE)
    lone_with_mapman = n_support == 1 & has_mapman
    mapman_label = paste0(method, "_MapMan4")

    new_criteria = rep(NA_character_, n_cand)
    new_criteria[n_support == 3] = "OrthoDB_FastOMA_RBH"
    new_criteria[n_support == 2] = support_label[n_support == 2]
    new_criteria[lone_with_mapman] = mapman_label

    # Increment count for this mapman4 assignment
    mapman4_counts[[mapman_label]] = mapman4_counts[[mapman_label]] + sum(lone_with_mapman)

    accepted = !is.na(new_criteria)
    if (any(accepted)) {
      accepted_rows = candidates[accepted]
      dt[accepted_rows, filter_criteria := new_criteria[accepted]]
      # Only the target gene is marked as covered.
      # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
      covered_genes = unique(c(covered_genes, dt$to_geneID[accepted_rows]))
    }
  } else {
    # Non-special methods are trusted as they are.
    dt[candidates, filter_criteria := method]
    # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
    covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            3883            1578            2305
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                4234                2305                1038                 583 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               20146                 722                3883                 692 
##         RBH_MapMan4              reject 
##                1578               31847
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# Hand the labelled table back to `df`.
# NOTE(review): `dt` is a data.table, so this assignment presumably does not copy it;
# later `:=` updates on `df` would then also be visible through `dt` - confirm if `dt` is reused.
df = dt

# Export every candidate pair (accepted and rejected), once as tab-separated text ...
data.table::fwrite(df, 
                   paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
# ... and once as an Excel table.
openxlsx::write.xlsx(df, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

6.12 Filtered

# Split the labelled pairs into the rejected and the accepted set.
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Recompute the per-gene pair counts by reference (no merge needed):
# in df over all pairs, in kept over the accepted pairs only.
# `rejected` is not updated and keeps the counts it had when it was split off.
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


# 2 x 2 panel of count distributions, all pairs next to kept pairs, on shared axes.
par(mfrow = c(2,2))
xlim = c(0,100)
# Histograms are computed once without plotting to find a common y-axis limit.
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
# Common title across the four panels.
mtext("Before and after filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

# For one method: number of kept pairs it supports, split by MapMan match category.
tally_kept = function(method) {
  kept[, {
    supported = get(method) == TRUE
    list(
      Method = method,
      Match_Type = c("no match", "100% match based", "partial match"),
      Count = c(
        sum(supported & MapMan4_Match == "no match"),
        sum(supported & stringr::str_detect(MapMan4_Match, "100% match based")),
        sum(supported & stringr::str_detect(MapMan4_Match, "partial match"))
      )
    )
  }]
}

long_kept = data.table::rbindlist(lapply(methods, tally_kept), use.names = TRUE)

# Fix the stacking order of the bars.
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(
    title = "MapMan match types count per method (after filter)",
    x = "Method",
    y = "Count",
    fill = "Match Type"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)


# Keep only the evidence count and the match label of the kept pairs,
# and cut the label before "based on".
keptsub = kept[, grep("count_evidence|MapMan4_Match", names(kept), value = TRUE), with = FALSE]
data.table::set(keptsub, j = "MapMan4_Match", value = sub('based on.*', '', keptsub$MapMan4_Match))
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          31920           2377            884
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1        9237      838            497
##   2        4767      764            138
##   3        4396      319             80
##   4        6401      242             82
##   5        7119      214             87
# Cross-tabulate evidence count against match label (kept pairs) in long form for plotting.
tab = setnames(
  as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match))),
  c("count_evidence", "MapMan4_Match", "Freq")
)

# The labels end in a space because they were cut right before "based on".
tab[, MapMan4_Match := factor(as.character(MapMan4_Match),
                              levels = c('no match', 'partial match ', '100% match '))]

ggplot2::ggplot(tab, ggplot2::aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(
    title = "Frequency of count_evidence by MapMan4_Match (after filter)",
    x = "count_evidence",
    y = "Frequency",
    fill = "MapMan4_Match"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)


# Method flags, evidence count, match label and acceptance criterion of the kept pairs.
keptsub = kept[, grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria",
                      names(kept), value = TRUE),
               with = FALSE]
# Here the blank before "based on" is removed too, so the labels carry no trailing space.
data.table::set(keptsub, j = "MapMan4_Match", value = sub(' based on.*', '', keptsub$MapMan4_Match))

# Three-way frequency table in long form; combinations that never occur are dropped.
tab = setnames(
  as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match))),
  c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq")
)
tab = tab[Freq > 0]

# Fixed display order for the match labels and for the acceptance criteria.
match_levels = c('no match', 'partial match', '100% match')
criteria_levels = c("MCScanX", "ensembl-compara", "PLAZA",
                    "OrthoDB_FastOMA_RBH",
                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4")
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = criteria_levels)]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = match_levels)]


ggplot2::ggplot(tab, ggplot2::aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  ggplot2::labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 90, hjust = 1),
    panel.border = ggplot2::element_rect(color = "black", fill = NA, size = 1)  # border around each facet
  )

ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)


# Export the pairs that stayed labelled "reject" as an Excel table.
openxlsx::write.xlsx(rejected, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


# Build an undirected graph with one edge per kept gene pair and label every gene
# with the connected component it belongs to.
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# Both genes of a pair share an edge and therefore a component, so looking the
# component up by from_geneID is enough for an undirected graph.
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# For a directed graph the component would be looked up on both sides instead:
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


# Export the kept pairs, including the component label, as an Excel table.
openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


# Method columns shown in the upset plot; flag1 selects the set.
source_cols = if (flag1 == 1) {
  c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}





# Upset plot of the kept pairs: which combinations of methods (source_cols) support them.
# Reference for the options used: https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    # intersection-size bars, drawn without count labels
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Show the plot in the report ...
print(upset_plot)

# ... and save it as a wide PDF.
ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 20087
length(unique(kept$to_geneID))
## [1] 20115
range(kept$from_count)
## [1]  1 51
range(kept$to_count)
## [1]  1 96
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 5
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 11
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

6.13 PSS kept/rejected

# Reduce the PSS table to its id, pathway and short-name columns and drop repeated rows.
# NOTE(review): selecting columns with `[, grep(...)]` assumes pss_long is a data.frame
# (a data.table would return the integer positions instead) - confirm.
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]

# Attach orthologue, annotation and filter-decision columns for every PSS id.
# The columns are selected from df by df's own names.
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(df), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)

# Keep ids starting with AT and drop rows that became identical after the merge.
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 118                  50                  53                  25 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 843                  42                  66                  19 
##         RBH_MapMan4              reject 
##                  21                1168
# Export the PSS table with its orthologues and filter decisions as an Excel table.
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
# Parameters for one run of the child document (here: wild cherry).
# Each comment placed after a comma describes the parameter directly above it.
params_list <- list(
  
  plantName1 = 'pavi'
  , # change name - species label used to filter the PLAZA, OrthoDB and RBH tables
  plantName2 = 'prunus_avium'
  , # change name - species label used to filter the ensembl-compara table
  plantName3 = 'wildcherry'
  ,  # change name - species label matched in the MCScanX table
  plantName4 = 'pavi'
  ,  # change name - species label used to filter the FastOMA table
  
  plantDirIn = "pavi_wildCherry"
  , # OrthoFinder input sub-directory (inconsistent IDs between annotation versions)
  plantNameOut = "wildcherry"
  ,
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "wildcherry")
  ,

  pattern_in = "-[^-]*$"
  , # regex for everything from the last hyphen onward (e.g. the "-T1" of FUN_000050-T1)
  pattern_out = ""
  , # replacement for pattern_in; applied to all target IDs
  compara_pattern_in1 = "^(Pav_(sc|co)\\d+\\.\\d+_g\\d+\\.\\d+\\.(br|mk)).*" # the compara IDs come with extra text concatenated after the actual ID
  ,
  compara_pattern_out1 = "\\1"
  ,
  compara_pattern_in2 = ''
  ,
  compara_pattern_out2 = ''
  ,
  plaza_pattern_in1 = ""
  ,
  plaza_pattern_in2 = ""
  ,
  
  ref_genome = "Prunus_avium_Tieton.proteins"
  , # value of the Species column to keep in the OrthoFinder tables (inconsistent IDs)
  
  mercator = 'pavi_Mercator4v7_results.txt'
  , # file name of the plant's Mercator results (plant gmm)
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']"
  , # plant gmm: generic removal of straight and curly quote characters
  mercatorPatternOut1 = ""
  , # plant gmm: replacement for mercatorPatternIn1
  mercatorPatternIn2 = "Fun"
  , # plant gmm: second substitution, pattern
  mercatorPatternOut2 = "FUN" # plant gmm: second substitution, replacement
  ,
  flag1 = 2
  ,
  flag2 = 1
  ,
  flag3 = TRUE # compara$Ortholog contains mRNA, space, gene
)

# note: in compara - geneID and prot ID are completely different

# Create a fresh environment and copy every entry of params_list into it as a variable.
env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000002741a131e38>

# Knit the shared child document inside `env`; the rendered text is returned as a string.
child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-112] | |….. | 9% | |…… | 12% [unnamed-chunk-113] | |…….. | 15% | |……… | 18% [unnamed-chunk-114] | |……….. | 21% | |………… | 24% [unnamed-chunk-115] | |………….. | 27% | |…………… | 30% [unnamed-chunk-116] | |…………….. | 33% | |……………… | 36% [unnamed-chunk-117] | |……………….. | 39% | |………………… | 42% [unnamed-chunk-118] | |………………….. | 45% | |…………………… | 48% [unnamed-chunk-119] | |…………………….. | 52% | |……………………… | 55% [unnamed-chunk-120] | |……………………….. | 58% | |………………………… | 61% [unnamed-chunk-121] | |………………………….. | 64% | |…………………………… | 67% [unnamed-chunk-122] | |…………………………….. | 70% | |……………………………… | 73% [unnamed-chunk-123] | |……………………………….. | 76% | |………………………………… | 79% [unnamed-chunk-124] | |………………………………….. | 82% | |…………………………………… | 85% [unnamed-chunk-125] | |…………………………………….. | 88% | |……………………………………… | 91% [unnamed-chunk-126] | |……………………………………….. | 94% | |………………………………………… | 97% [unnamed-chunk-127] | |…………………………………………..| 100%

# Write the rendered child content into this report.
cat(child_content)

7 Subsection: pavi

# Make sure the report directory for this plant exists, including any missing parent directories.
if (!dir.exists(plantDirOut)) {
  dir.create(plantDirOut, recursive = TRUE)
}

7.1 Ortho sources

# Zipped intermediate tables, one per orthology source.
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
is_ortho_source = grepl('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees', fl) # change names
is_zip = grepl('\\.zip$', fl)
fl = fl[is_ortho_source & is_zip]

# Read every source table, keep the rows for the current plant and bring them into
# one common layout: from_geneID, from_protID, to_geneID, to_protID, source.
# Where a source supplies only one kind of ID, the other kind is set to NA and
# is derived in a later step.
std_cols = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
parts = list()

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  # Each file is expected to carry exactly one source label; fail with a clear message otherwise.
  if (length(us) != 1) {
    stop('Expected exactly one source label in ', i, ' but found ', length(us), call. = FALSE)
  }
  
  if (us == 'ensembl-compara') {
    
    # gene and protein IDs are both supplied
    dt = dt[dt$homology_species == plantName2, ]
    dt = dt[, c(1,2,6,7,10)]
    na_cols = integer(0)
    
  } else if (us == 'FastOMA') {
    
    # columns 1 and 3 (geneID slots) are blanked; the protID slots are kept
    dt = dt[dt$to_plant == plantName4, ]
    dt = dt[, c(2,1, 4,3, 5)]
    na_cols = c(1, 3)
    
  } else if (us == 'MCScanX') {
    
    # columns 1 and 3 (geneID slots) are blanked; the protID slots are kept
    # dt = dt[grepl('stu', dt$to_plant), ]
    dt = dt[grepl(plantName3, dt$to_plant), ] #  change names
    dt = dt[, c(2,1, 4,3, 6)]
    na_cols = c(1, 3)
    
  } else if (us %in% c('OrthoDB', 'RBH')) {
    
    # columns 2 and 4 (protID slots) are blanked; the geneID slots are kept
    dt = dt[dt$to_plant == plantName1, ]
    na_cols = c(2, 4)
    
  } else if (us == 'PLAZA') {
    
    # columns 2 and 4 (protID slots) are blanked; the geneID slots are kept
    dt = dt[dt$orthologous_species == plantName1, ]
    na_cols = c(2, 4)
    
  } else {
    
    # Unknown tables are skipped, but with a proper warning instead of a printed string.
    warning('Unknown source "', us, '" in ', i, ' - file skipped', call. = FALSE)
    next
    
  }
  
  colnames(dt) = std_cols
  if (length(na_cols) > 0) {
    data.table::set(dt, j = na_cols, value = NA)
  }
  parts[[length(parts) + 1]] = dt
}

# Stack all sources at once instead of growing the table inside the loop.
df = if (length(parts) > 0) data.table::rbindlist(parts, use.names = TRUE) else NULL
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
table(df$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB             RBH 
##            4504           48941           39725           38228           25594
# Preview: the first two and the last two rows of every source, ordered by source.
by_source = dplyr::group_by(df, source)
first_last_three_per_source = dplyr::ungroup(
  dplyr::arrange(
    dplyr::bind_rows(
      dplyr::slice_head(by_source, n = 2),
      dplyr::slice_tail(by_source, n = 2)
    ),
    source
  )
)

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_protID to_geneID                 to_protID            source
##    <chr>       <chr>       <chr>                     <chr>                <chr> 
##  1 <NA>        AT1G12040.1 <NA>                      FUN_000050-T1        FastO…
##  2 <NA>        AT1G62440.1 <NA>                      FUN_000050-T1        FastO…
##  3 <NA>        AT2G43840.2 <NA>                      FUN_040335-T1        FastO…
##  4 <NA>        AT2G44050.1 <NA>                      FUN_040336-T1        FastO…
##  5 <NA>        AT5G58130.1 <NA>                      FUN_000052-T1        MCSca…
##  6 <NA>        AT5G58130.1 <NA>                      FUN_000053-T1        MCSca…
##  7 <NA>        AT2G43840.2 <NA>                      FUN_040335-T1        MCSca…
##  8 <NA>        AT2G44050.1 <NA>                      FUN_040336-T1        MCSca…
##  9 AT4G39370   <NA>        FUN_020728                <NA>                 Ortho…
## 10 AT3G06350   <NA>        FUN_020749                <NA>                 Ortho…
## 11 AT4G24220   <NA>        FUN_029917                <NA>                 Ortho…
## 12 AT4G24220   <NA>        FUN_029968                <NA>                 Ortho…
## 13 AT1G01030   <NA>        FUN_025493                <NA>                 RBH   
## 14 AT1G01040   <NA>        FUN_011748                <NA>                 RBH   
## 15 ATMG01250   <NA>        FUN_040221                <NA>                 RBH   
## 16 ATMG01360   <NA>        FUN_026804                <NA>                 RBH   
## 17 AT1G01210   AT1G01210.3 Pav_sc0000586.1_g580.1.mk Pav_sc0000586.1_g58… ensem…
## 18 AT1G01225   AT1G01225.1 Pav_sc0000586.1_g550.1.mk Pav_sc0000586.1_g55… ensem…
## 19 ATCG00710   ATCG00710.1 Pav_sc0000216.1_g900.1.mk Pav_sc0000216.1_g90… ensem…
## 20 ATMG00310   ATMG00310.1 Pav_sc0001554.1_g020.1.br Pav_sc0001554.1_g02… ensem…

7.2 Transcript (aka protein) to geneID

# Where the Arabidopsis gene ID is missing, derive it from the protein ID
# by removing a trailing ".<digits>" suffix.
df[is.na(from_geneID), from_geneID := sub("\\.[0-9]+$", "", from_protID)]

# orfs!
# Gene IDs that still contain a dot after the suffix removal.
ind = grep('.', df$from_geneID, fixed = TRUE)
table(df$source[ind])
## 
## MCScanX 
##       5
print(df[ind, ])
##        from_geneID     from_protID to_geneID     to_protID  source
##             <char>          <char>    <char>        <char>  <char>
## 1: AT4G19110.uORF1 AT4G19110.uORF1      <NA> FUN_023847-T1 MCScanX
## 2: AT5G45430.uORF1 AT5G45430.uORF1      <NA> FUN_023847-T1 MCScanX
## 3: AT5G09460.uORF1 AT5G09460.uORF1      <NA> FUN_028837-T1 MCScanX
## 4: AT5G64340.uORF1 AT5G64340.uORF1      <NA> FUN_028837-T1 MCScanX
## 5: AT1G29950.uORF2 AT1G29950.uORF2      <NA> FUN_032407-T1 MCScanX
# Where the target gene ID is missing, derive it from the protein ID with pattern_in/pattern_out.
ind = which(is.na(df$to_geneID))
df[ind, to_geneID := sub(pattern_in, pattern_out, to_protID)] # change logic as needed



# Preview again after the ID derivation: first two and last two rows of every source.
by_source = dplyr::group_by(df, source)
first_last_three_per_source = dplyr::ungroup(
  dplyr::arrange(
    dplyr::bind_rows(
      dplyr::slice_head(by_source, n = 2),
      dplyr::slice_tail(by_source, n = 2)
    ),
    source
  )
)

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_protID to_geneID                 to_protID            source
##    <chr>       <chr>       <chr>                     <chr>                <chr> 
##  1 AT1G12040   AT1G12040.1 FUN_000050                FUN_000050-T1        FastO…
##  2 AT1G62440   AT1G62440.1 FUN_000050                FUN_000050-T1        FastO…
##  3 AT2G43840   AT2G43840.2 FUN_040335                FUN_040335-T1        FastO…
##  4 AT2G44050   AT2G44050.1 FUN_040336                FUN_040336-T1        FastO…
##  5 AT5G58130   AT5G58130.1 FUN_000052                FUN_000052-T1        MCSca…
##  6 AT5G58130   AT5G58130.1 FUN_000053                FUN_000053-T1        MCSca…
##  7 AT2G43840   AT2G43840.2 FUN_040335                FUN_040335-T1        MCSca…
##  8 AT2G44050   AT2G44050.1 FUN_040336                FUN_040336-T1        MCSca…
##  9 AT4G39370   <NA>        FUN_020728                <NA>                 Ortho…
## 10 AT3G06350   <NA>        FUN_020749                <NA>                 Ortho…
## 11 AT4G24220   <NA>        FUN_029917                <NA>                 Ortho…
## 12 AT4G24220   <NA>        FUN_029968                <NA>                 Ortho…
## 13 AT1G01030   <NA>        FUN_025493                <NA>                 RBH   
## 14 AT1G01040   <NA>        FUN_011748                <NA>                 RBH   
## 15 ATMG01250   <NA>        FUN_040221                <NA>                 RBH   
## 16 ATMG01360   <NA>        FUN_026804                <NA>                 RBH   
## 17 AT1G01210   AT1G01210.3 Pav_sc0000586.1_g580.1.mk Pav_sc0000586.1_g58… ensem…
## 18 AT1G01225   AT1G01225.1 Pav_sc0000586.1_g550.1.mk Pav_sc0000586.1_g55… ensem…
## 19 ATCG00710   ATCG00710.1 Pav_sc0000216.1_g900.1.mk Pav_sc0000216.1_g90… ensem…
## 20 ATMG00310   ATMG00310.1 Pav_sc0001554.1_g020.1.br Pav_sc0001554.1_g02… ensem…
# Per source: number of rows without a target gene ID / target protein ID.
summary_na = df[, lapply(.SD, function(ids) sum(is.na(ids))),
                by = source, .SDcols = c("to_geneID", "to_protID")]
setnames(summary_na, c("to_geneID", "to_protID"), c("na_to_geneID", "na_to_protID"))
print(summary_na)
##             source na_to_geneID na_to_protID
##             <char>        <int>        <int>
## 1: ensembl-compara            0            0
## 2:         FastOMA            0            0
## 3:         MCScanX            0            0
## 4:         OrthoDB            0        38228
## 5:             RBH            0        25594

7.3 PLAZA and ensembl-compara with Orthofinder

Here we have some losses, because gene IDs do not always translate cleanly between annotation versions!

# Translate the target IDs of the ensembl-compara / PLAZA rows of `df` to the
# current gene models, using OrthoFinder tables found in
# ../input/OrthoFinder/<plantDirIn>, then reduce `df` to from_geneID, to_geneID
# and source. When flag1 or flag2 equals 4 only the column reduction is done.
#
# Changes compared with the previous version of this chunk:
#   * scalar `&&` in the condition;
#   * a flag2 value other than 1 or 2 no longer crashes on `NULL[...]`; the
#     ensembl-compara rows are simply dropped, as `df_compara = NULL` intended;
#   * PLAZA rows are translated even when no Compara table is available;
#   * the duplicated compara / plaza code is shared through small helpers.
if (flag1 != 4 && flag2 != 4) {

  fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
  fl = list.files(fp)

  # Read the OrthoFinder table whose file name contains `tag`, keep the rows of
  # the reference genome and name the third column 'source'. A missing file
  # gives an empty 4-column placeholder (0 rows).
  read_orthofinder = function(tag) {
    fn = fl[grep(tag, fl)] # change filename
    if (length(fn) != 0) {
      tab = data.table::fread(file.path(fp, fn))
    } else {
      tab = data.frame(matrix(ncol = 4, nrow = 0))
    }
    tab = tab[tab$Species == ref_genome, ] # change name
    colnames(tab)[3] = 'source'
    tab
  }

  # Expand every row into all (OrthoDB_ID, Ortholog) combinations; both the
  # 'source' and the 'Orthologs' cells hold comma-separated ID lists.
  expand_pairs = function(tab) {
    tab[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    tab[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    pairs = tab[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      p = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(p, c("OrthoDB_ID", "Ortholog"))
      p
    }, by = seq_len(nrow(tab))]
    pairs[, seq_len := NULL]
    pairs
  }

  # Replace the target gene IDs of the rows of `tab` coming from `src` by the
  # orthologs listed in `map`; `key` is the column of `tab` matched against
  # map$OrthoDB_ID. Rows without a translation are dropped.
  translate_ids = function(tab, src, map, key) {
    out = dplyr::filter(tab, source == src) %>%
      dplyr::left_join(map, by = setNames("OrthoDB_ID", key), relationship = "many-to-many") %>%
      dplyr::mutate(to_geneID = Ortholog) %>%
      dplyr::select(-Ortholog)
    out[!is.na(out$to_geneID), ]
  }

  compara = read_orthofinder('Compara_')
  plaza = read_orthofinder('PLAZA_')

  if (nrow(compara) != 0) {
    compara = expand_pairs(compara)
    # compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1,
                             sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
    compara = compara[!duplicated(compara), ]
    if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog) # improve if possible
  }

  if (nrow(plaza) != 0) {
    plaza = expand_pairs(plaza)
    # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
    plaza = plaza[!duplicated(plaza), ]
  }

  # `handled` lists the sources whose rows are replaced by translated rows;
  # `translated` collects the replacement tables in the order compara, PLAZA.
  handled = character()
  translated = list()

  if (nrow(compara) != 0) {
    handled = c(handled, "ensembl-compara")
    # flag2 == 1: geneID and prot ID are completely different, join on the gene ID
    # flag2 == 2: join on the protein ID
    # anything else: no join key, the ensembl-compara rows are dropped # make flags
    compara_key = if (flag2 == 1) "to_geneID" else if (flag2 == 2) "to_protID" else NULL
    if (!is.null(compara_key)) {
      translated = c(translated, list(translate_ids(df, "ensembl-compara", compara, compara_key)))
    }
  }

  if (nrow(plaza) != 0) {
    handled = c(handled, "PLAZA")
    translated = c(translated, list(translate_ids(df, "PLAZA", plaza, "to_geneID")))
  }

  if (length(handled) != 0) {
    df_other = dplyr::filter(df, !(source %in% handled))
    dt = dplyr::bind_rows(c(translated, list(df_other)))
  } else {
    dt = df
  }
  # `..ind` below is data.table syntax; make sure the combined table is one
  dt = data.table::as.data.table(dt)

  ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
  df = dt[, ..ind]
  df = df[!duplicated(df), ]

  # Bring the translated IDs to the same form as the other sources
  if (length(handled) != 0) {
    ind = which(df$source %in% handled)
    df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
  }

  # Preview: first two and last two rows per source
  df %>%
    dplyr::group_by(source) %>%
    dplyr::slice_head(n = 2) %>%
    dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
    dplyr::arrange(source) %>%
    dplyr::ungroup() -> first_last_three_per_source

  print(first_last_three_per_source, n = nrow(first_last_three_per_source))

} else {
  ind = c(grep("from_geneID|to_geneID|source", colnames(df)))
  df = df[, ..ind]

}
## # A tibble: 20 × 3
##    from_geneID to_geneID  source         
##    <chr>       <chr>      <chr>          
##  1 AT1G12040   FUN_000050 FastOMA        
##  2 AT1G62440   FUN_000050 FastOMA        
##  3 AT2G43840   FUN_040335 FastOMA        
##  4 AT2G44050   FUN_040336 FastOMA        
##  5 AT5G58130   FUN_000052 MCScanX        
##  6 AT5G58130   FUN_000053 MCScanX        
##  7 AT2G43840   FUN_040335 MCScanX        
##  8 AT2G44050   FUN_040336 MCScanX        
##  9 AT4G39370   FUN_020728 OrthoDB        
## 10 AT3G06350   FUN_020749 OrthoDB        
## 11 AT4G24220   FUN_029917 OrthoDB        
## 12 AT4G24220   FUN_029968 OrthoDB        
## 13 AT1G01030   FUN_025493 RBH            
## 14 AT1G01040   FUN_011748 RBH            
## 15 ATMG01250   FUN_040221 RBH            
## 16 ATMG01360   FUN_026804 RBH            
## 17 AT1G01210   FUN_011652 ensembl-compara
## 18 AT1G01225   FUN_011648 ensembl-compara
## 19 AT5G67620   FUN_021483 ensembl-compara
## 20 ATMG00310   FUN_030126 ensembl-compara
# Drop exact duplicate rows, then clear the workspace: every object except the
# names listed below is removed (this includes `dt` and any helper objects
# created so far).
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))




gc()
##            used  (Mb) gc trigger (Mb)  max used   (Mb)
## Ncells  2606729 139.3    7508477  401  11731995  626.6
## Vcells 48847313 372.7  123591554  943 193111802 1473.4
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

7.4 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22172
length(unique(dt$to_geneID))
## [1] 21950
table(dt$source)
## 
## ensembl-compara         FastOMA         MCScanX         OrthoDB             RBH 
##            4367           45924           19709           38228           25594
# Marker column that becomes the cell value of the wide table.
# NOTE(review): `dt` was assigned from `df` without copy(), so this `:=`
# presumably adds the column to `df` as well - confirm that this is harmless.
dt[, present := TRUE]

# One row per (from_geneID, to_geneID) pair and one logical column per source;
# pairs not reported by a source get FALSE.
dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

# Order rows by Arabidopsis gene, then by target gene
dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

7.5 Upset plot

# Evidence columns expected in dt.wide for the current flag1 setting:
#   1 -> all six sources, 2 -> no PLAZA, 3 -> neither PLAZA nor ensembl-compara,
#   any other value -> MCScanX, RBH and FastOMA only.
source_cols = c(
  "MCScanX",
  if (flag1 == 1 || flag1 == 2) "ensembl-compara",
  if (flag1 == 1) "PLAZA",
  if (flag1 == 1 || flag1 == 2 || flag1 == 3) 'OrthoDB',
  'RBH',
  "FastOMA"
)

# Number of sources that support each gene pair
data.table::set(dt.wide, j = "count_evidence",
                value = rowSums(dt.wide[, ..source_cols]))

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))

# Plain data.frame copy handed to upset()
dff = as.data.frame(dt.wide)

# Intersections are sorted by degree (number of sets in the intersection)
# descending, then by size (cardinality) descending within each degree.
upset_plot = ComplexUpset::upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = ComplexUpset::intersection_size(counts = FALSE)
    # 'Intersection ratio' = intersection_ratio()
  ),
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending"
)
upset_plot = upset_plot +
  ggplot2::ggtitle("Overlap of gene pairs supported by multiple methods")

# Show the plot and save it as PDF
print(upset_plot)

ggplot2::ggsave(
  file.path("..", "reports", "fruitTrees", plantNameOut,
            paste0(plantNameOut, "_upset_plot_2025-09-15.pdf")), # change name
  plot = upset_plot, width = 24, height = 6, device = "pdf"
)

7.6 Ath ORFs

  • Take care: besides the regular transcripts (e.g. AT1G30330.1, AT1G30330.2, AT1G30330.3), the ath CDS FASTA used for MCScanX also contains uORF entries, for example:
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
dt.wide[grep('ORF', dt.wide$from_geneID), ]
## Key: <from_geneID, to_geneID>
##        from_geneID  to_geneID FastOMA MCScanX OrthoDB    RBH ensembl-compara
##             <char>     <char>  <lgcl>  <lgcl>  <lgcl> <lgcl>          <lgcl>
## 1: AT1G29950.uORF2 FUN_032407   FALSE    TRUE   FALSE  FALSE           FALSE
## 2: AT4G19110.uORF1 FUN_023847   FALSE    TRUE   FALSE  FALSE           FALSE
## 3: AT5G09460.uORF1 FUN_028837   FALSE    TRUE   FALSE  FALSE           FALSE
## 4: AT5G45430.uORF1 FUN_023847   FALSE    TRUE   FALSE  FALSE           FALSE
## 5: AT5G64340.uORF1 FUN_028837   FALSE    TRUE   FALSE  FALSE           FALSE
##    count_evidence
##             <num>
## 1:              1
## 2:              1
## 3:              1
## 4:              1
## 5:              1
dt.wide = dt.wide[grep('ORF', dt.wide$from_geneID, invert = TRUE), ]

7.7 Gene occurrence

# Count in how many pairs each target gene (to_count) and each Arabidopsis
# gene (from_count) occurs, and attach both counts to every pair.
to_counts = dt.wide[, .(to_count = .N), by = to_geneID]
from_counts = dt.wide[, .(from_count = .N), by = from_geneID]
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

# Column order: IDs and evidence flags first, then from_count, to_count and
# count_evidence.
##### take care here
ind = unlist(lapply(
  c('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara',
    'from_count',
    'to_count',
    'count_evidence'),
  grep, x = colnames(dt.wide)
))
dt.wide = dt.wide[, ..ind]

7.8 In/out PSS

# Annotate every pair with the Arabidopsis Mercator bins (ath.gmm), gene names
# (gn), synonyms (sn) and PSS entries (pss_long); all pairs are kept.
df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = Reduce(
  function(acc, annot) merge(acc, annot, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE),
  list(gn, sn),
  df
)
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

# PSS entries with an AT... id that have no orthologue pair, with the same
# annotation attached
nin = pss_long[!(pss_long$id %in% df$from_geneID), ]
nin = nin[grepl('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = Reduce(
  function(acc, annot) merge(acc, annot, by.x = 'id', by.y = 'V1', all.x = TRUE),
  list(gn, sn),
  nin
)

openxlsx::write.xlsx(
  nin,
  file.path('..', 'reports', 'fruitTrees', plantNameOut,
            paste0(plantNameOut, '-ath_pss_no-orthologues_2025-09-15.xlsx')), # change name
  asTable = TRUE
)

7.9 fruitTrees plant gmm

# Read the Mercator result of the plant (file name taken from `mercator`),
# drop rows whose identifier is the empty quoted string, and collapse the
# bin code, name and description of every identifier into " | "-separated
# lists of unique values.
fp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[IDENTIFIER != "''"]

combined = gmm[,
  lapply(.SD, function(values) paste(unique(values), collapse = " | ")),
  by = IDENTIFIER,
  .SDcols = c("BINCODE", "NAME", "DESCRIPTION")
]

charToRaw(combined$IDENTIFIER[1])
##  [1] 27 66 75 6e 5f 30 31 33 33 39 34 2d 74 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
charToRaw(combined$IDENTIFIER[1])
##  [1] 66 75 6e 5f 30 31 33 33 39 34 2d 74 31
# Upper-case the first character of every identifier, apply the second
# Mercator pattern substitution, then the pattern_in -> pattern_out
# substitution that is also applied to to_geneID, so that the identifiers can
# be matched against to_geneID.
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2))  # change as needed
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
## 16470 23868
# Remove a single quote from the annotation columns.
# NOTE(review): sub() removes only the FIRST "'" of each string, so collapsed
# values such as "'a' | 'b'" presumably keep their remaining quotes - confirm
# whether gsub() is wanted here (the ath table is cleaned the same way).
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)

# Prefix the three annotation columns so they cannot clash with the ath_ ones
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "ensembl-compara" "from_count"     
##  [9] "to_count"        "count_evidence"  "ath_BINCODE"     "ath_NAME"       
## [13] "ath_DESCRIPTION" "athName"         "athSynonims"     "all_pathways"   
## [17] "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE  TRUE 
## 76691     2
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## [1] "FUN_040149" "FUN_040149"
# Restore the column order of `df` (the merge moved to_geneID to the front):
# original df columns first, then the columns added by the merge. The result
# is a plain data.frame.
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
# Clear the workspace: everything except the names listed below is removed
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))


gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  3216797 171.8    7508477 401.0  11731995  626.6
## Vcells 45502821 347.2  123635155 943.3 193111802 1473.4
library(magrittr)
library(ggplot2)
library(ComplexUpset)

7.10 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


#' Compare two Mercator/MapMan bin annotations at the first three bin levels.
#'
#' Each argument is a single string holding one or more bin codes separated by
#' "|" or ";" (e.g. "1.2.3.4 | 5.6"). Every code is trimmed, stripped of single
#' quotes and truncated to its first three dot-separated levels before the two
#' sets of codes are compared.
#'
#' @param athMercator Bin codes of the Arabidopsis gene (length-1 character,
#'   may be NA or "").
#' @param plantXMercator Bin codes of the other plant's gene (same format).
#' @return A length-1 character:
#'   "100% match based on <codes>" when both sets are identical,
#'   "partial match based on <codes>" when they share at least one code,
#'   "no match" otherwise - including when either annotation is missing.
compare_bin <- function(athMercator, plantXMercator) {
  # Split on | and ;, trim, drop quotes and empty tokens, then truncate each
  # token to its first three dot-separated levels.
  split_tokens = function(code) {
    if (is.na(code) || code == "") return(character(0))
    tokens = unlist(strsplit(as.character(code), "[|;]"))
    # Upstream cleaning may leave single quotes on some tokens; remove them so
    # that the same bin always yields the same token.
    tokens = trimws(gsub("'", "", tokens, fixed = TRUE))
    tokens = tokens[nzchar(tokens)]

    truncated_tokens = vapply(
      strsplit(tokens, ".", fixed = TRUE),
      function(levels) paste(levels[seq_len(min(3L, length(levels)))], collapse = "."),
      character(1)
    )
    unique(truncated_tokens)
  }

  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)

  # A missing annotation on either side can never be a match. Without this
  # guard two empty sets would be "identical" and reported as a 100% match.
  if (length(bin_set) == 0 || length(v4_set) == 0) {
    return("no match")
  }

  # Identical sets. This also covers the case of several plant codes that all
  # collapse to the single Arabidopsis code after truncation.
  if (setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }

  # Partial match if any tokens overlap, mention those tokens
  common_tokens = intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }

  "no match"
}



# Row by row, compare the Arabidopsis bins with the bins of the other plant
df = dplyr::ungroup(
  dplyr::mutate(
    dplyr::rowwise(df),
    MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE) # change name
  )
)

7.11 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 22167
length(unique(df$to_geneID))
## [1] 21948
range(df$from_count)
## [1]   1 122
range(df$to_count)
## [1]   1 115
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 242
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 131
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# Every pair starts as rejected; the filter below overwrites the label of the
# pairs that are accepted. covered_genes collects target genes already used.
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()

# Methods in the order in which the filter visits them, by flag1:
#   1 -> all six, 2 -> no PLAZA, 3 -> neither PLAZA nor ensembl-compara,
#   any other value -> MCScanX, RBH and FastOMA only. # make flags
methods = c(
  "MCScanX",
  if (flag1 == 1 || flag1 == 2) "ensembl-compara",
  if (flag1 == 1) "PLAZA",
  if (flag1 == 1 || flag1 == 2 || flag1 == 3) 'OrthoDB',
  'RBH',
  "FastOMA"
)

match_categories = c("no match", "100% match based", "partial match")

# Per method: number of supported pairs in each MapMan match category
long_dt = data.table::rbindlist(lapply(methods, function(method) {
  supported = dt[[method]] == TRUE
  data.table(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(supported & dt$MapMan4_Match == "no match"),
      sum(supported & stringr::str_detect(dt$MapMan4_Match, "100% match based")),
      sum(supported & stringr::str_detect(dt$MapMan4_Match, "partial match"))
    )
  )
}), use.names = TRUE)

long_dt$Match_Type = factor(long_dt$Match_Type,
                            levels = c("no match", "partial match", "100% match based"))

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(
  file.path("..", "reports", "fruitTrees", plantNameOut,
            paste0(plantNameOut, "-before_filter1.pdf")),
  device = "pdf", width = 6, height = 6, units = "in"
)


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          57035          12115           2872
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       28882    10077           2401
##   2        9643     1295            248
##   3        8304      454            119
##   4        8447      244             84
##   5        1759       45             20
# Long table of the count_evidence x MapMan4_Match cross-tabulation
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

# The trailing blanks in 'partial match ' and '100% match ' are intentional:
# the labels were cut with sub('based on.*', '', ...), which keeps the blank
# in front of "based on".
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

# Stacked bars: pairs per evidence count, coloured by match category
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# No plot argument is given, so ggsave() writes the last plot displayed
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




# Assign a filter_criteria label to the accepted gene pairs.
#
# The methods are visited in the order of `methods`. For each method the
# candidates are the still rejected pairs supported by that method whose genes
# are not yet in covered_genes (the candidate set is fixed before any pair of
# the current method is accepted).
#   * non-special methods: every candidate is accepted, labelled with the
#     method name;
#   * special methods: a candidate is accepted when it is supported by
#       - three special methods      -> "OrthoDB_FastOMA_RBH"
#       - two special methods        -> their names, sorted, joined by "_"
#       - only the current method AND MapMan4_Match contains "match based on"
#                                    -> "<method>_MapMan4".
# Target genes of accepted pairs are added to covered_genes.
#
# The decision for a candidate depends only on its own row, so the special
# methods are handled in vectorised form instead of a row-by-row loop.

# Special methods; OrthoDB is only among them when neither flag equals 4
if (flag1 != 4 && flag2 != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

# Sorted once: labels of pairs supported by two special methods use this order
sorted_special = sort(special_methods)

for (method in methods) {

  candidates = which(dt$filter_criteria == "reject" & dt[[method]] == TRUE &
                       !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes))
  if (length(candidates) == 0) next

  if (method %in% special_methods) {
    # One logical column per special method, one row per candidate
    support = do.call(cbind, lapply(sorted_special, function(m) dt[[m]][candidates] == TRUE))
    n_support = rowSums(support)

    new_criteria = rep(NA_character_, length(candidates))
    new_criteria[which(n_support == 3)] = "OrthoDB_FastOMA_RBH"

    two = which(n_support == 2)
    new_criteria[two] = vapply(
      two,
      function(k) paste(sorted_special[support[k, ]], collapse = "_"),
      character(1)
    )

    # Supported by the current method only: accept when the MapMan4 bins match
    # reconsider
    # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
    #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
    single = which(n_support == 1 &
                     grepl("match based on", dt$MapMan4_Match[candidates], ignore.case = TRUE))
    if (length(single) > 0) {
      mapman4_label = paste0(method, "_MapMan4")
      new_criteria[single] = mapman4_label
      # Increment count for this mapman4 assignment
      mapman4_counts[[mapman4_label]] = mapman4_counts[[mapman4_label]] + length(single)
    }

    is_accepted = !is.na(new_criteria)
    if (any(is_accepted)) {
      accepted_rows = candidates[is_accepted]
      accepted_labels = new_criteria[is_accepted]
      dt[accepted_rows, filter_criteria := accepted_labels]
      # only target genes are recorded; from_geneID is deliberately left out
      covered_genes = unique(c(covered_genes, dt$to_geneID[accepted_rows]))
    }
  } else {
    dt[candidates, filter_criteria := method]
    # only target genes are recorded; from_geneID is deliberately left out
    covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            5081            1779            3127
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                1319                3127                2137                 893 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               19814                3065                5081                1317 
##         RBH_MapMan4              reject 
##                1779               33490
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# Export all pairs, accepted and rejected, with their filter_criteria label,
# as tab-separated text and as an Excel table.
df = dt

data.table::fwrite(df, 
                   paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

7.12 Filtered

# Split the pairs into rejected and kept ones
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Recompute the per-gene pair counts by reference (no merge needed): in `df`
# over all pairs, in `kept` over the kept pairs only. `rejected` keeps the
# counts it had when it was split off.
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


# 2 x 2 panel of histograms: counts before (df) and after (kept) the filter,
# drawn on a common y axis. The previous graphics settings are restored
# afterwards instead of being hard-reset.
old_par = par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
# Common title across the four panels
mtext("Before and after filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)
par(old_par)

# Per method: number of kept pairs it supports in each MapMan match category
long_kept = data.table::rbindlist(lapply(methods, function(method) {
  supported = kept[[method]] == TRUE
  data.table(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(supported & kept$MapMan4_Match == "no match"),
      sum(supported & stringr::str_detect(kept$MapMan4_Match, "100% match based")),
      sum(supported & stringr::str_detect(kept$MapMan4_Match, "partial match"))
    )
  )
}), use.names = TRUE)

long_kept$Match_Type = factor(long_kept$Match_Type,
                              levels = c("no match", "partial match", "100% match based"))

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(
  file.path("..", "reports", "fruitTrees", plantNameOut,
            paste0(plantNameOut, "-after_filter1.pdf")),
  device = "pdf", width = 6, height = 6, units = "in"
)


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          34942           2583           1007
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       11045      883            634
##   2        6524     1002            163
##   3        7269      411            106
##   4        8345      242             84
##   5        1759       45             20
# Long table of the count_evidence x MapMan4_Match cross-tabulation (kept pairs)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

# The trailing blanks in 'partial match ' and '100% match ' are intentional:
# the labels were cut with sub('based on.*', '', ...), which keeps the blank
# in front of "based on".
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

# Stacked bars: kept pairs per evidence count, coloured by match category
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# No plot argument is given, so ggsave() writes the last plot displayed
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Kept pairs: evidence flags, evidence count, match label and filter label
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# Here the blank before "based on" is removed too, so the labels below carry
# no trailing blank
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
# Three-way cross-tabulation in long form; empty combinations are dropped
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
# Fixed display order of the filter labels; a label missing from this list
# becomes NA
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


# One facet per evidence count; bars per filter label, coloured by match label
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    # NOTE(review): recent ggplot2 releases presumably expect `linewidth`
    # instead of `size` here - confirm against the installed version
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

# No plot argument is given, so ggsave() writes the last plot displayed
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


# Build an undirected graph from the kept gene pairs and label every gene
# with the id of the connected component it belongs to.
edges = unique(kept[, c("from_geneID", "to_geneID")])
g = igraph::graph_from_data_frame(d = edges, directed = FALSE)
comp = igraph::components(graph = g)
membership_dt = data.table(
  geneID = names(comp[["membership"]]),
  weak_component = comp[["membership"]]
)
# The graph is undirected, so looking up the component through the
# "from" gene is sufficient.
kept = merge(x = kept, y = membership_dt,
             by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# Variant for a directed graph (kept for reference, not executed):
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


# Export the kept orthologue pairs (with component labels) as an Excel table.
openxlsx::write.xlsx(
  x = kept,
  file = paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
  asTable = TRUE
)


# Evidence sources available for this species: start from the full set and
# drop the ones that `flag1` declares unavailable.  # make flags
source_cols = setdiff(
  c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA"),
  if (flag1 == 1) {
    character(0)
  } else if (flag1 == 2) {
    "PLAZA"
  } else if (flag1 == 3) {
    c("ensembl-compara", "PLAZA")
  } else {
    c("ensembl-compara", "PLAZA", 'OrthoDB')
  }
)





# UpSet plot of the kept gene pairs: which combinations of sources support them.
# Reference: https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  data = kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Order intersections by degree (number of sets involved) first and by
  # cardinality (intersection size) second, both descending.
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending"
)
upset_plot = upset_plot +
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Show the plot in the report, then store it as PDF.
print(upset_plot)

ggsave(
  filename = paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
  plot = upset_plot, device = "pdf", width = 24, height = 6
)



# Summary of the kept pairs: number of distinct genes on each side, range of
# the per-gene pair counts, and how many genes still occur in more than 30 pairs.
cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 19864
length(unique(kept$to_geneID))
## [1] 20842
range(kept$from_count)
## [1]  1 59
range(kept$to_count)
## [1]  1 96
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 11
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 15
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

7.13 PSS kept/rejected

# Attach orthologue annotations and the filter verdict to the PSS gene list.
# Keep only the PSS identifier, pathway and short-name columns.
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
# The annotation columns are picked by the names of `df` itself. The original
# looked them up in `names(dt)`, a different object, which only works while
# both tables happen to carry the same columns.
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(df), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
# Restrict to identifiers starting with "AT" and drop rows duplicated by the merge.
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##     ensembl-compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                  49                 113                  79                  36 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 795                  84                 109                  61 
##         RBH_MapMan4              reject 
##                  28                1414
# Export the PSS genes together with their kept/rejected orthologues.
openxlsx::write.xlsx(
  x = pss_long,
  file = paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
  asTable = TRUE
)
# Per-species settings for the child document (here: apricot, tag "parm").
params_list <- list(
  # Species labels used to filter each orthology source table.
  plantName1 = 'parm',        # PLAZA, OrthoDB and RBH tables
  plantName2 = 'parm',        # ensembl-compara table
  plantName3 = '^apricot$',   # MCScanX table (regular expression)
  plantName4 = 'parm',        # FastOMA table

  # Input / output locations and names.
  plantDirIn = "parm_apricot",  # sub-folder of the OrthoFinder input
  plantNameOut = "apricot",     # used in output file names
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "apricot"),

  # Protein/transcript ID -> gene ID: `pattern_in` matches the last two
  # dot-separated suffix segments, which are replaced by `pattern_out`.
  pattern_in = "(\\.[^.]+){2}$",
  pattern_out = "",

  # Optional clean-up patterns for IDs translated via OrthoFinder
  # (empty strings: nothing is replaced).
  compara_pattern_in1 = "",   # improve by flag
  compara_pattern_out1 = "",
  compara_pattern_in2 = '',
  compara_pattern_out2 = '',
  plaza_pattern_in1 = "",
  plaza_pattern_in2 = "",

  # Value of the OrthoFinder `Species` column to keep.
  ref_genome = "Prunus_armeniaca_Marouch_n14_peptide",

  # Mercator annotation of the plant and the ID clean-up applied to it.
  mercator = 'parm_Mercator4v7_results.txt',
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']",  # quote characters to strip
  mercatorPatternOut1 = "",
  mercatorPatternIn2 = "([mg])",                       # letters to upper-case
  mercatorPatternOut2 = "\\U\\1",

  # flag1: which set of sources is available; flag2: how ensembl-compara IDs
  # are joined; flag3: whether compara orthologue names are trimmed.
  flag1 = 3,
  flag2 = 3,
  flag3 = FALSE
)

# note: in compara - geneID and prot ID are completely different

# Expose every setting as a variable in a fresh environment for the child.
env <- new.env()
list2env(x = params_list, envir = env)

<environment: 0x00000273f83332a0>

# Knit the shared child document, evaluating its chunks in `env` so that the
# settings from `params_list` are visible as plain variables.
child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-146] | |….. | 9% | |…… | 12% [unnamed-chunk-147] | |…….. | 15% | |……… | 18% [unnamed-chunk-148] | |……….. | 21% | |………… | 24% [unnamed-chunk-149] | |………….. | 27% | |…………… | 30% [unnamed-chunk-150] | |…………….. | 33% | |……………… | 36% [unnamed-chunk-151] | |……………….. | 39% | |………………… | 42% [unnamed-chunk-152] | |………………….. | 45% | |…………………… | 48% [unnamed-chunk-153] | |…………………….. | 52% | |……………………… | 55% [unnamed-chunk-154] | |……………………….. | 58% | |………………………… | 61% [unnamed-chunk-155] | |………………………….. | 64% | |…………………………… | 67% [unnamed-chunk-156] | |…………………………….. | 70% | |……………………………… | 73% [unnamed-chunk-157] | |……………………………….. | 76% | |………………………………… | 79% [unnamed-chunk-158] | |………………………………….. | 82% | |…………………………………… | 85% [unnamed-chunk-159] | |…………………………………….. | 88% | |……………………………………… | 91% [unnamed-chunk-160] | |……………………………………….. | 94% | |………………………………………… | 97% [unnamed-chunk-161] | |…………………………………………..| 100%

# Write the knitted child output into the parent report.
cat(child_content)

8 Subsection: parm

# Make sure the report directory for this species exists.
if (!dir.exists(plantDirOut)) {
  dir.create(plantDirOut, recursive = TRUE)
}

8.1 Ortho sources

# Collect the zipped intermediate tables of all orthology sources.
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grepl('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees', fl) &
          grepl('\\.zip$', fl)] # change names

# Accumulator for the stacked pair table.
df = NULL

# Read every orthology source table, keep the rows of the current plant and
# stack them into `df` with a common layout:
#   from_geneID, from_protID, to_geneID, to_protID, source
# ID columns a branch does not fill are set to NA; missing gene IDs are
# derived from the protein IDs further below.
for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  # The branches below compare `us` inside `if()`, which needs exactly one
  # value. Fail with a clear message instead of a "condition has length"
  # or "argument is of length zero" error.
  if (length(us) != 1) {
    stop("Expected exactly one value in the 'source' column of ", i,
         ", found ", length(us), call. = FALSE)
  }
  
  if(us == 'ensembl-compara') {
    
    dt = dt[dt$homology_species == plantName2, ]
    # print(head(dt))
    # columns are picked by position (assumes the compara export layout - TODO confirm)
    dt = dt[, c(1,2,6,7,10)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    df = rbind(df, dt)
    
  } else if (us == 'FastOMA') {
    
    dt = dt[dt$to_plant == plantName4, ]
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 5)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    # gene ID columns are blanked for this source
    dt[, 1] = NA
    dt[, 3] = NA
    df = rbind(df, dt)
    
  } else if (us == 'MCScanX') {
    
    # dt = dt[grepl('stu', dt$to_plant), ]
    dt = dt[grepl(plantName3, dt$to_plant), ] #  change names
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 6)]
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    # gene ID columns are blanked for this source
    dt[, 1] = NA
    dt[, 3] = NA
    df = rbind(df, dt)
    
  } else if (us == 'PLAZA') {
    
    dt = dt[dt$orthologous_species == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    # protein ID columns are blanked for this source
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  } else if (us == 'OrthoDB') {
    
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    # protein ID columns are blanked for this source
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  } else if (us == 'RBH') {
    
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
    # protein ID columns are blanked for this source
    dt[, 2] = NA
    dt[, 4] = NA
    df = rbind(df, dt)
    
  }   else print ('ERROR: Unknown source')
}
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
table(df$source)
## 
## FastOMA MCScanX OrthoDB     RBH 
##   45941   37435   52084   25259
# Preview: the first two and the last two rows of every source.
first_last_three_per_source = dplyr::ungroup(
  dplyr::arrange(
    dplyr::bind_rows(
      dplyr::slice_head(dplyr::group_by(df, source), n = 2),
      dplyr::slice_tail(dplyr::group_by(df, source), n = 2)
    ),
    source
  )
)

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 16 × 5
##    from_geneID from_protID to_geneID       to_protID               source 
##    <chr>       <chr>       <chr>           <chr>                   <chr>  
##  1 <NA>        AT1G12040.1 <NA>            PruarM.1G000500.t1.p1_1 FastOMA
##  2 <NA>        AT1G62440.1 <NA>            PruarM.1G000500.t1.p1_1 FastOMA
##  3 <NA>        AT2G47410.2 <NA>            PruarM.8G368500.t1.p1_1 FastOMA
##  4 <NA>        AT4G02060.2 <NA>            PruarM.8G368700.t1.p1_1 FastOMA
##  5 <NA>        AT5G58130.1 <NA>            PruarM.1G001200.t1.p1   MCScanX
##  6 <NA>        AT5G58110.1 <NA>            PruarM.1G001800.t1.p1   MCScanX
##  7 <NA>        AT4G02060.1 <NA>            PruarM.8G368700.t1.p1   MCScanX
##  8 <NA>        AT4G02060.2 <NA>            PruarM.8G368700.t1.p1   MCScanX
##  9 AT5G10270   <NA>        PruarM.1G279700 <NA>                    OrthoDB
## 10 AT5G64960   <NA>        PruarM.1G279700 <NA>                    OrthoDB
## 11 AT2G15790   <NA>        PruarM.8G195500 <NA>                    OrthoDB
## 12 AT4G34660   <NA>        PruarM.8G163400 <NA>                    OrthoDB
## 13 AT1G01010   <NA>        PruarM.2G368400 <NA>                    RBH    
## 14 AT1G01030   <NA>        PruarM.5G193300 <NA>                    RBH    
## 15 ATMG01360   <NA>        PruarM.4G189900 <NA>                    RBH    
## 16 ATMG01360   <NA>        PruarM.4G190100 <NA>                    RBH

8.2 Transcript (aka protein) to geneID

# Where the source gave no gene ID, derive it from the protein ID by
# stripping the trailing ".<number>" suffix.
from_missing = is.na(df$from_geneID)
df$from_geneID[from_missing] = sub("\\.[0-9]+$", "", df$from_protID[from_missing])

# orfs! IDs that still contain a dot did not end in a numeric suffix
ind = grep('\\.', df$from_geneID)
table(df$source[ind])
## 
## MCScanX 
##       6
print(df[ind, ])
##        from_geneID     from_protID to_geneID             to_protID  source
##             <char>          <char>    <char>                <char>  <char>
## 1: AT3G25570.uORF1 AT3G25570.uORF1      <NA> PruarM.1G494600.t1.p1 MCScanX
## 2: AT1G25470.uORF1 AT1G25470.uORF1      <NA> PruarM.1G507600.t1.p1 MCScanX
## 3: AT1G68550.uORF1 AT1G68550.uORF1      <NA> PruarM.1G507600.t1.p1 MCScanX
## 4: AT1G29950.uORF2 AT1G29950.uORF2      <NA> PruarM.4G082900.t1.p1 MCScanX
## 5: AT2G27230.uORF1 AT2G27230.uORF1      <NA> PruarM.6G181100.t1.p1 MCScanX
## 6: AT3G12010.uORF1 AT3G12010.uORF1      <NA> PruarM.7G184500.t1.p1 MCScanX
# Same for the plant side: missing gene IDs come from the protein ID after
# applying the species-specific pattern.
to_missing = is.na(df$to_geneID)
df$to_geneID[to_missing] = sub(pattern_in, pattern_out, df$to_protID[to_missing]) # change logic as needed



# Preview again after filling in the gene IDs: first and last two rows per source.
first_last_three_per_source = dplyr::ungroup(
  dplyr::arrange(
    dplyr::bind_rows(
      dplyr::slice_head(dplyr::group_by(df, source), n = 2),
      dplyr::slice_tail(dplyr::group_by(df, source), n = 2)
    ),
    source
  )
)

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 16 × 5
##    from_geneID from_protID to_geneID       to_protID               source 
##    <chr>       <chr>       <chr>           <chr>                   <chr>  
##  1 AT1G12040   AT1G12040.1 PruarM.1G000500 PruarM.1G000500.t1.p1_1 FastOMA
##  2 AT1G62440   AT1G62440.1 PruarM.1G000500 PruarM.1G000500.t1.p1_1 FastOMA
##  3 AT2G47410   AT2G47410.2 PruarM.8G368500 PruarM.8G368500.t1.p1_1 FastOMA
##  4 AT4G02060   AT4G02060.2 PruarM.8G368700 PruarM.8G368700.t1.p1_1 FastOMA
##  5 AT5G58130   AT5G58130.1 PruarM.1G001200 PruarM.1G001200.t1.p1   MCScanX
##  6 AT5G58110   AT5G58110.1 PruarM.1G001800 PruarM.1G001800.t1.p1   MCScanX
##  7 AT4G02060   AT4G02060.1 PruarM.8G368700 PruarM.8G368700.t1.p1   MCScanX
##  8 AT4G02060   AT4G02060.2 PruarM.8G368700 PruarM.8G368700.t1.p1   MCScanX
##  9 AT5G10270   <NA>        PruarM.1G279700 <NA>                    OrthoDB
## 10 AT5G64960   <NA>        PruarM.1G279700 <NA>                    OrthoDB
## 11 AT2G15790   <NA>        PruarM.8G195500 <NA>                    OrthoDB
## 12 AT4G34660   <NA>        PruarM.8G163400 <NA>                    OrthoDB
## 13 AT1G01010   <NA>        PruarM.2G368400 <NA>                    RBH    
## 14 AT1G01030   <NA>        PruarM.5G193300 <NA>                    RBH    
## 15 ATMG01360   <NA>        PruarM.4G189900 <NA>                    RBH    
## 16 ATMG01360   <NA>        PruarM.4G190100 <NA>                    RBH
# Count missing plant-side gene and protein IDs per source.
summary_na = df[, lapply(.SD, function(id_col) sum(is.na(id_col))),
                by = source, .SDcols = c("to_geneID", "to_protID")]
setnames(summary_na, c("to_geneID", "to_protID"), c("na_to_geneID", "na_to_protID"))
print(summary_na)
##     source na_to_geneID na_to_protID
##     <char>        <int>        <int>
## 1: FastOMA            0            0
## 2: MCScanX            0            0
## 3: OrthoDB            0        52084
## 4:     RBH            0        25259

8.3 PLAZA and ensembl-compara with Orthofinder

Some gene pairs are lost here because gene IDs do not translate well between annotation versions!

# Translate the plant gene IDs of the ensembl-compara and PLAZA pairs to the
# reference genome via OrthoFinder tables, then reduce `df` to the columns
# from_geneID / to_geneID / source. A value of 4 in either flag skips the
# translation and only performs the column reduction.
# `&&` is used because both flags are scalars.
if (flag1 != 4 && flag2 != 4) {

  fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
  
  # Load the OrthoFinder tables if present; otherwise use empty placeholders
  # so the row-count checks below turn the corresponding steps off.
  fl = list.files(fp)
  fn = fl[grep('Compara_', fl)] # change filename
  if (length(fn) != 0) {
    compara = data.table::fread(file.path(fp, fn))
  } else {
    compara = data.frame(matrix(ncol = 4, nrow = 0))
  }
  
  fn = fl[grep('PLAZA_', fl)] # change filename
  if (length(fn) != 0) {
    plaza = data.table::fread(file.path(fp, fn))
  } else {
    plaza = data.frame(matrix(ncol = 4, nrow = 0))
  }
  
  
  # Keep only the orthogroups of the reference genome.
  compara = compara[compara$Species == ref_genome, ] # change name
  plaza = plaza[plaza$Species == ref_genome, ] # change name
  
  
  # The third column holds the IDs as used by the external source.
  colnames(compara)[3] = colnames(plaza)[3] = 'source'
  
  if (nrow(compara) != 0) {
    compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    result = compara[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(pairs, c("OrthoDB_ID", "Ortholog"))
      pairs
    }, by = seq_len(nrow(compara))]
    compara = result[, seq_len := NULL]
    # compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1, 
                             sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
    compara = compara[!duplicated(compara), ]
  }
  
  
  if (nrow(plaza) != 0) {
    plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    result = plaza[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(pairs, c("OrthoDB_ID", "Ortholog"))
      pairs
    }, by = seq_len(nrow(plaza))]
    plaza = result[, seq_len := NULL]
    # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
    plaza = plaza[!duplicated(plaza), ]
  }
  
  # Optionally keep only the text after the last space of each orthologue name.
  if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog) # improve if possible
  
  if (nrow(compara) != 0) {
    # flag2 selects the join key: 1 = plant gene ID, 2 = plant protein ID,
    # anything else = the compara rows are dropped.
    if (flag2 == 1) { # geneID and prot ID are completely different # make flags
      df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog)  
    } else if (flag2 == 2) {
        df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog) 
    } else {
      df_compara = NULL
    }
    # pairs without a translated ID are lost here
    df_compara = df_compara[!is.na(df_compara$to_geneID), ]
  }
  
  
  
  if (nrow(plaza) != 0) {
    df_plaza = dplyr::filter(df, source == "PLAZA") %>%
      dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
      dplyr::mutate(to_geneID = Ortholog) %>%
      dplyr::select(-Ortholog)
    df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
  }
  
  # Re-assemble translated and untouched sources.
  # NOTE(review): when only the PLAZA table is available (compara empty),
  # `df_plaza` is computed above but not used and the untranslated `df` is
  # kept - confirm this is intended.
  if (nrow(compara) != 0) {
    if (nrow(plaza) != 0) {
      df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))  
      dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
    } else {
      df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
      dt = dplyr::bind_rows(df_compara, df_other)
    }
  } else {
    dt = df
  }
  
  
  # Keep gene-level columns only and drop pairs duplicated within a source.
  ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
  df = dt[, ..ind]
  df = df[!duplicated(df), ]
  
  
  # Translated IDs are reduced to gene level with the same pattern that is
  # applied to protein IDs above.
  if (nrow(compara) != 0) {
    if (nrow(plaza) != 0) {
      ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
      df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
    } else {
      ind = which(df$source %in% c('ensembl-compara'))
      df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
    }
  }
  
  
  
  
  
  # Preview: first and last two rows per source.
  df %>%
    dplyr::group_by(source) %>%
    dplyr::slice_head(n = 2) %>%
    dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
    dplyr::arrange(source) %>%
    dplyr::ungroup() -> first_last_three_per_source
  
  print(first_last_three_per_source, n = nrow(first_last_three_per_source))

} else {
  ind = c(grep("from_geneID|to_geneID|source", colnames(df)))
  df = df[, ..ind]
  
}
## # A tibble: 16 × 3
##    from_geneID to_geneID       source 
##    <chr>       <chr>           <chr>  
##  1 AT1G12040   PruarM.1G000500 FastOMA
##  2 AT1G62440   PruarM.1G000500 FastOMA
##  3 AT2G47410   PruarM.8G368500 FastOMA
##  4 AT4G02060   PruarM.8G368700 FastOMA
##  5 AT5G58130   PruarM.1G001200 MCScanX
##  6 AT5G58110   PruarM.1G001800 MCScanX
##  7 AT2G47410   PruarM.8G368500 MCScanX
##  8 AT4G02060   PruarM.8G368700 MCScanX
##  9 AT5G10270   PruarM.1G279700 OrthoDB
## 10 AT5G64960   PruarM.1G279700 OrthoDB
## 11 AT2G15790   PruarM.8G195500 OrthoDB
## 12 AT4G34660   PruarM.8G163400 OrthoDB
## 13 AT1G01010   PruarM.2G368400 RBH    
## 14 AT1G01030   PruarM.5G193300 RBH    
## 15 ATMG01360   PruarM.4G189900 RBH    
## 16 ATMG01360   PruarM.4G190100 RBH
# Drop duplicated pairs, then clear the workspace except for the pair table,
# the shared annotation tables and the settings still needed below.
df = df[!duplicated(df), ]
keep_objects = c(
  "df",
  "ath.gmm", "gn", "sn", "pss_long",
  "plantName1",
  "plantNameOut",
  "plantDirOut",
  "pattern_in",
  "pattern_out",
  "mercator",
  "mercatorPatternIn1",
  "mercatorPatternOut1",
  "mercatorPatternIn2",
  "mercatorPatternOut2",
  "flag1", "flag2"
)
# `keep_objects` is not listed in itself, so it is removed as well.
rm(list = setdiff(ls(), keep_objects))




gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  2669622 142.6    7508477 401.0  11731995  626.6
## Vcells 49267998 375.9  123635155 943.3 193111802 1473.4
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

8.4 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22357
length(unique(dt$to_geneID))
## [1] 22551
table(dt$source)
## 
## FastOMA MCScanX OrthoDB     RBH 
##   43038   18616   52084   25259
# Mark every (pair, source) row as present. `dt` was assigned from `df`
# without copy(), so this by-reference update also adds the column to `df`.
dt[, present := TRUE]

# One row per gene pair, one logical column per source (FALSE = not reported).
dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

# Sort by Arabidopsis gene, then by plant gene.
dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

8.5 Upset plot

# Evidence sources available for this species: start from the full set and
# drop the ones that `flag1` declares unavailable.
source_cols = setdiff(
  c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA"),
  if (flag1 == 1) {
    character(0)
  } else if (flag1 == 2) {
    "PLAZA"
  } else if (flag1 == 3) {
    c("ensembl-compara", "PLAZA")
  } else {
    c("ensembl-compara", "PLAZA", 'OrthoDB')
  }
)


# Number of sources supporting each gene pair.
dt.wide[, count_evidence := rowSums(dt.wide[, source_cols, with = FALSE])]

hist(x = dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))

# Plain data.frame copy for the UpSet plot.
dff = as.data.frame(dt.wide)

# UpSet plot of all gene pairs: which combinations of sources support them.
upset_plot = upset(
  data = dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Order intersections by degree (number of sets involved) first and by
  # cardinality (intersection size) second, both descending.
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending"
)
upset_plot = upset_plot +
  ggtitle("Overlap of gene pairs supported by multiple methods")

# Show the plot in the report, then store it as PDF.
print(upset_plot)

ggsave(
  filename = paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"),
  plot = upset_plot, device = "pdf", width = 24, height = 6
) # change name

8.6 Ath ORFs

  • Take care: besides regular transcripts (e.g. AT1G30330.1, AT1G30330.2, AT1G30330.3), the ath CDS FASTA used for MCScanX also contains uORF entries, for example:
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
dt.wide[grep('ORF', dt.wide$from_geneID), ]
## Key: <from_geneID, to_geneID>
##        from_geneID       to_geneID FastOMA MCScanX OrthoDB    RBH
##             <char>          <char>  <lgcl>  <lgcl>  <lgcl> <lgcl>
## 1: AT1G25470.uORF1 PruarM.1G507600   FALSE    TRUE   FALSE  FALSE
## 2: AT1G29950.uORF2 PruarM.4G082900   FALSE    TRUE   FALSE  FALSE
## 3: AT1G68550.uORF1 PruarM.1G507600   FALSE    TRUE   FALSE  FALSE
## 4: AT2G27230.uORF1 PruarM.6G181100   FALSE    TRUE   FALSE  FALSE
## 5: AT3G12010.uORF1 PruarM.7G184500   FALSE    TRUE   FALSE  FALSE
## 6: AT3G25570.uORF1 PruarM.1G494600   FALSE    TRUE   FALSE  FALSE
##    count_evidence
##             <num>
## 1:              1
## 2:              1
## 3:              1
## 4:              1
## 5:              1
## 6:              1
# Remove pairs whose Arabidopsis identifier is an ORF entry rather than a gene.
dt.wide = dt.wide[!grepl('ORF', dt.wide$from_geneID), ]

8.7 Gene occurrence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

8.8 In/out PSS

# Annotate the Arabidopsis side of every pair: Mercator bins, gene names,
# synonyms and PSS membership. All merges keep every row of the pair table.
df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
for (annot in list(gn, sn)) {
  df = merge(df, annot, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
}
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

# PSS genes (identifiers starting with "AT") without any orthologue pair.
nin = pss_long[!(pss_long$id %in% df$from_geneID), ]
nin = nin[grepl('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
for (annot in list(gn, sn)) {
  nin = merge(nin, annot, by.x = 'id', by.y = 'V1', all.x = TRUE)
}

openxlsx::write.xlsx(
  x = nin,
  file = paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
  asTable = TRUE
) # change name

8.9 fruitTrees plant gmm

# Read the plant's Mercator annotation and collapse it to one row per
# identifier, joining the distinct values of each field with " | ".
fp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
# rows whose identifier is just a pair of quotes carry no identifier
gmm = gmm[IDENTIFIER != "''"]

combined = gmm[, lapply(.SD, function(field) paste(unique(field), collapse = " | ")),
               by = IDENTIFIER, .SDcols = c("BINCODE", "NAME", "DESCRIPTION")]

charToRaw(combined$IDENTIFIER[1])
##  [1] 27 70 72 75 61 72 6d 2e 33 67 30 30 35 33 30 30 2e 74 31 2e 70 31 5f 31 27
# Earlier attempts, kept for reference:
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# sub() replaces only the first match per string, so an identifier quoted at
# both ends needs gsub() to lose both quote characters.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
charToRaw(combined$IDENTIFIER[1])
##  [1] 70 72 75 61 72 6d 2e 33 67 30 30 35 33 30 30 2e 74 31 2e 70 31 5f 31
# Bring the lower-case Mercator identifiers into the gene ID form used by the
# pair table: capitalise the first letter, apply the species-specific
# replacement, then cut the ID down to gene level.
ids = combined$IDENTIFIER
ids = paste0(toupper(substring(ids, 1, 1)), substring(ids, 2))  # change as needed
ids = gsub(mercatorPatternIn2, mercatorPatternOut2, ids, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, ids, perl=TRUE)
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
## 15496 24571
# Strip a quote from the annotation fields and prefix them so they can be
# told apart from the Arabidopsis columns after merging.
# NOTE(review): sub() replaces only the first match in each string.
annot_cols = c("BINCODE", "NAME", "DESCRIPTION")
combined[, (annot_cols) := lapply(.SD, function(field) sub("\\'", '', field)),
         .SDcols = annot_cols]

setnames(combined, 2:4, paste('fruitTrees', colnames(combined)[2:4], sep = '_'))

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "from_count"      "to_count"       
##  [9] "count_evidence"  "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION"
## [13] "athName"         "athSynonims"     "all_pathways"    "short_name"
# Add the plant's Mercator annotation to every pair (all pairs are kept) and
# check how many pairs ended up without one.
dt = merge(x = df, y = combined,
           by.x = 'to_geneID', by.y = 'IDENTIFIER',
           all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE 
## 90420
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
# The merge moved `to_geneID` to the front: restore the column order of `df`
# and append the newly added annotation columns.
dt = as.data.frame(dt)
df = dt[, c(colnames(df), setdiff(colnames(dt), colnames(df)))]
# Clear the workspace except for the annotated pair table, the shared
# annotation tables and the settings still needed below.
keep_objects = c(
  "df",
  "ath.gmm", "gn", "sn", "pss_long",
  "plantName1",
  "plantNameOut",
  "plantDirOut",
  "pattern_in",
  "pattern_out",
  "mercator",
  "mercatorPatternIn1",
  "mercatorPatternOut1",
  "mercatorPatternIn2",
  "mercatorPatternOut2",
  "flag1", "flag2"
)
# `keep_objects` is not listed in itself, so it is removed as well.
rm(list = setdiff(ls(), keep_objects))


gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  2560974 136.8    7508477 401.0  11731995  626.6
## Vcells 33609612 256.5   98908124 754.7 193111802 1473.4
library(magrittr)
library(ggplot2)
library(ComplexUpset)

8.10 Translation table

MapMan Mercator matches: first three levels only

# Drop fully identical rows before the row-wise bin comparison.
df = df[!duplicated(df), ]


# Compare two Mercator/MapMan bin annotations on their first three levels.
#
# Both arguments are single strings holding one or more bin codes separated
# by "|" and/or ";" (for example "1.2.3.4 | 5.6"). Each code is trimmed,
# stripped of quote characters and truncated to its first three
# dot-separated levels before the two sets are compared.
#
# athMercator     bin codes of the Arabidopsis gene (string, may be NA or "")
# plantXMercator  bin codes of the plant gene (string, may be NA or "")
#
# Returns one string:
#   "100% match based on <bins>"    - both sides reduce to the same bin set
#   "partial match based on <bins>" - the sets share at least one bin
#   "no match"                      - nothing shared, or an annotation is missing
compare_bin <- function(athMercator, plantXMercator) {
  # Split one annotation string into its unique three-level bin codes.
  split_tokens = function(code) {
    if (length(code) == 0 || is.na(code) || !nzchar(code)) return(character(0))
    tokens = unlist(strsplit(as.character(code), "[|;]"))
    # Quote characters left over from the Mercator file must not take part in
    # the comparison: "1.2.3'" and "'1.2.3'" denote the same bin as "1.2.3".
    tokens = trimws(gsub("[\"']", "", tokens))
    tokens = tokens[nzchar(tokens)]
    
    # vapply keeps the result a character vector even when no token is left.
    truncated_tokens = vapply(
      strsplit(tokens, ".", fixed = TRUE),
      function(levels) paste(levels[seq_len(min(3L, length(levels)))], collapse = "."),
      character(1)
    )
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Two missing annotations are not evidence of agreement.
  if (length(bin_set) == 0 || length(v4_set) == 0) {
    return("no match")
  }
  
  # Identical sets. This also covers a plant gene that lists several sub-bins
  # of the single Arabidopsis bin, since the sub-bins collapse to that bin.
  if (setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any bins overlap; report the shared ones.
  common_tokens = intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  "no match"
}



# Classify every pair by comparing the Arabidopsis and plant bin codes,
# one row at a time.
by_row = dplyr::rowwise(df)
by_row = dplyr::mutate(by_row, MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) # change name 
df = dplyr::ungroup(by_row)

8.11 Filter

# now

# Summary before filtering: number of distinct genes on each side, range of
# the per-gene pair counts, and how many genes occur in more than 30 pairs.
cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 22351
length(unique(df$to_geneID))
## [1] 22547
range(df$from_count)
## [1]   1 267
range(df$to_count)
## [1]   1 113
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 344
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 392
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# Working copy for the filter: every pair starts out as rejected.
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


# Sources available for this species (driven by `flag1`).  # make flags
methods = setdiff(
  c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA"),
  if (flag1 == 1) {
    character(0)
  } else if (flag1 == 2) {
    "PLAZA"
  } else if (flag1 == 3) {
    c("ensembl-compara", "PLAZA")
  } else {
    c("ensembl-compara", "PLAZA", 'OrthoDB')
  }
)


match_categories = c("no match", "100% match based", "partial match")

# For each source: how many of its pairs fall into each match category.
long_dt = data.table::rbindlist(lapply(methods, function(method) {
  supported = dt[[method]] == TRUE
  match_label = dt$MapMan4_Match
  data.table::data.table(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(supported & match_label == "no match"),
      sum(supported & stringr::str_detect(match_label, "100% match based")),
      sum(supported & stringr::str_detect(match_label, "partial match"))
    )
  )
}), use.names = TRUE)

# Stacking order in the plot.
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

# Stacked bar chart of long_dt: one bar per method, stacked by MapMan4 match
# category (state before filtering).
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

# No `plot` argument is given, so ggsave() writes the last displayed ggplot,
# i.e. the chart above; it must have been printed for this to work.
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Keep only the evidence-count and match columns, then collapse the match text
# to its class by cutting everything from "based on" onwards. The space in
# front of "based on" is NOT removed, so the classes become "100% match " and
# "partial match " (trailing space) while "no match" is unchanged.
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          72560          14582           3205
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       43938    12426           2648
##   2       10208     1389            288
##   3        9348      461            163
##   4        9066      306            106
# Long-format contingency table (count_evidence x match class) for plotting.
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

# The trailing spaces in 'partial match ' and '100% match ' are intentional:
# they match the labels left by sub('based on.*', '', ...) above. Removing them
# would turn those classes into NA.
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

# Stacked bars: number of pairs per evidence count, split by MapMan4 match
# class (state before filtering).
ggplot2::ggplot(
  tab,
  ggplot2::aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)
) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(
    title = "Frequency of count_evidence by MapMan4_Match",
    x = "count_evidence",
    y = "Frequency",
    fill = "MapMan4_Match"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

# Saves the last displayed ggplot (no explicit `plot` argument).
ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)




# "Special" methods are not trusted on their own in the filter loop below: a
# pair supported by only one of them is kept only if the MapMan4 annotation
# agrees, whereas support by two or three of them is accepted directly.
# `&&` is used because both flags are scalars tested inside `if`.
# NOTE(review): with flag1 == 3 and flag2 == 4, `methods` contains "OrthoDB"
# but special_methods does not, so OrthoDB hits would be accepted without the
# extra check - confirm that this combination is not used or is intended.
if (flag1 != 4 && flag2 != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

# Greedy filter in priority order of `methods`.
#
# For each method, the candidate rows are those that are still "reject", are
# supported by the method, and whose genes are not yet in covered_genes (the
# target genes claimed by earlier methods). Candidates are determined once per
# method, so several rows of the same gene can be accepted within one method.
#
# * Non-special methods: every candidate is accepted, filter_criteria = method.
# * Special methods: acceptance depends on how many special methods support
#   the row:
#     3 -> "OrthoDB_FastOMA_RBH"
#     2 -> the two method names, sorted and joined by "_"
#     1 -> "<method>_MapMan4", only if MapMan4_Match contains "match based on"
#          (full or partial MapMan4 agreement); otherwise the row stays rejected.
#
# The special branch is evaluated for all candidates at once instead of row by
# row; assignments and the resulting covered_genes are the same as with the
# former per-row loop. The evidence columns are assumed to hold TRUE/FALSE
# without NA.
for (method in methods) {

  candidates = which(
    dt$filter_criteria == "reject" & dt[[method]] == TRUE &
      !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  )

  if (length(candidates) == 0) next

  if (method %in% special_methods) {

    # Logical matrix: candidates x special methods.
    support = as.matrix(dt[candidates, special_methods, with = FALSE]) == TRUE
    n_support = rowSums(support)

    new_criteria = rep(NA_character_, length(candidates))
    new_criteria[n_support == 3] = "OrthoDB_FastOMA_RBH"

    pair_rows = which(n_support == 2)
    if (length(pair_rows) > 0) {
      new_criteria[pair_rows] = vapply(pair_rows, function(k) {
        paste(sort(special_methods[support[k, ]]), collapse = "_")
      }, character(1))
    }

    # Single-method support: require a MapMan4 match (case-insensitive).
    # reconsider
    # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
    #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
    single_rows = n_support == 1 &
      grepl("match based on", dt$MapMan4_Match[candidates], ignore.case = TRUE)
    mapman_label = paste0(method, "_MapMan4")
    new_criteria[single_rows] = mapman_label

    # Count the MapMan4-backed assignments of this method
    mapman4_counts[[mapman_label]] = mapman4_counts[[mapman_label]] + sum(single_rows)

    is_accepted = !is.na(new_criteria)
    accepted_rows = candidates[is_accepted]
    accepted_criteria = new_criteria[is_accepted]

    dt[accepted_rows, filter_criteria := accepted_criteria]
    # covered_genes = unique(c(covered_genes, to_geneID, from_geneID)) # alternative: also block the source gene
    covered_genes = unique(c(covered_genes, dt$to_geneID[accepted_rows]))

  } else {
    dt[candidates, filter_criteria := method]
    # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
    covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##           18771            2310            2937
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH             MCScanX 
##                2937                2501                 921               20367 
## OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH         RBH_MapMan4 
##                4043               18771                1491                2310 
##              reject 
##               37006
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# From here on `df` refers to the filtered table (same data.table as `dt`).
df = dt

# Export all pairs, kept and rejected, as tab-separated text and as Excel.
# The date in the file names is hard-coded.
data.table::fwrite(df, 
                   paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

8.12 Filtered

# Split the pairs by filter outcome.
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Recompute the per-gene pair counts by reference (no merge needed):
# first over all pairs in df ...
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

# ... then over the kept pairs only, so that kept$*_count reflect the state
# after filtering.
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





# 2 x 2 panel of histograms comparing pair counts per gene before (df) and
# after (kept) filtering. All panels share the x range 0-100 and a common y
# range taken from the tallest bin of the four histograms.
par(mfrow = c(2,2))
xlim = c(0,100)
# Histograms are first computed without plotting to obtain the bin counts.
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
# Page title is drawn while the 2 x 2 page is still active; the layout is
# reset afterwards for the following plots.
mtext("Before and after filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)
par(mfrow = c(1,1))

# Same summary as before filtering, restricted to the kept pairs: per method,
# the number of supported pairs in each MapMan4 match category.
long_kept = data.table::rbindlist(lapply(methods, function(method) {
  supported = kept[[method]] == TRUE
  match_label = kept$MapMan4_Match
  data.table::data.table(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(supported & match_label == "no match"),
      sum(supported & stringr::str_detect(match_label, "100% match based")),
      sum(supported & stringr::str_detect(match_label, "partial match"))
    )
  )
}), use.names = TRUE)

# Fix the stacking/legend order of the categories.
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

# Stacked bar chart of long_kept: one bar per method, stacked by MapMan4 match
# category (state after filtering).
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

# No `plot` argument: ggsave() writes the last displayed ggplot.
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Evidence-count and match columns of the kept pairs; the match text is cut at
# "based on", which leaves a trailing space on "100% match " and
# "partial match ".
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          49594           2646           1101
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       25239      837            648
##   2        6966     1089            196
##   3        8323      414            151
##   4        9066      306            106
# Long-format contingency table (count_evidence x match class) of the kept
# pairs, for plotting.
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

# Levels keep the trailing space produced by sub('based on.*', '', ...) above;
# without it the two match classes would become NA.
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

# Stacked bars: number of kept pairs per evidence count, split by MapMan4
# match class (state after filtering).
ggplot2::ggplot(
  tab,
  ggplot2::aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)
) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(
    title = "Frequency of count_evidence by MapMan4_Match (after filter)",
    x = "count_evidence",
    y = "Frequency",
    fill = "MapMan4_Match"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

# Saves the last displayed ggplot (no explicit `plot` argument).
ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)


# Three-way frequency table of the kept pairs:
# evidence count x filter criterion x MapMan4 match class.
keptsub = kept[, .SD,
               .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria",
                              names(kept), value = TRUE)]
# Here the blank before "based on" is removed as well, so the class labels
# carry no trailing space.
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(
  table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)
))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))

# Drop empty cells, then fix the factor orders used on the axes and legend.
tab = tab[Freq > 0]
tab[, `:=`(
  count_evidence = factor(count_evidence),
  filter_criteria = factor(filter_criteria,
                           levels = c("MCScanX", "ensembl-compara", "PLAZA",
                                      "OrthoDB_FastOMA_RBH",
                                      "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
                                      "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4")),
  MapMan4_Match = factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
)]


# Stacked bars of kept pairs per filter criterion, split by MapMan4 match
# class, with one facet per evidence count.
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    # border around each facet; `linewidth` replaces the `size` argument,
    # which is deprecated for line/rect elements since ggplot2 3.4.0
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
  )

# No `plot` argument: ggsave() writes the last displayed ggplot.
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Export the rejected pairs for manual inspection (date in the name is fixed).
openxlsx::write.xlsx(rejected, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


# Build an undirected graph from the kept pairs and label every gene with the
# id of the connected component it belongs to.
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# Attach the component id via from_geneID only: both ends of a pair are joined
# by an edge and therefore always share a component. Note that merge() returns
# `kept` sorted by from_geneID.
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# Disabled alternative (separate from/to lookup, as one would do for a
# directed graph):
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


# Export the kept pairs. Unlike the other tables this one is written to
# ../output rather than ../reports/fruitTrees/<plant>.
openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


# Evidence columns shown in the UpSet plot; same flag1 rules as for `methods`
# (make flags): 1 = all sources, 2 = no PLAZA, 3 = no PLAZA/ensembl-compara,
# anything else = MCScanX, RBH and FastOMA only.
source_cols = c(
  "MCScanX",
  if (flag1 == 1 || flag1 == 2) "ensembl-compara",
  if (flag1 == 1) "PLAZA",
  if (flag1 == 1 || flag1 == 2 || flag1 == 3) "OrthoDB",
  "RBH",
  "FastOMA"
)





# https://krassowski.github.io/complex-upset/articles/Examples_R.html
# UpSet plot of the kept pairs: which combinations of evidence sources
# (source_cols, logical columns of `kept`) support how many pairs.
# upset() and intersection_size() come from ComplexUpset.
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    # bar heights only, no count labels on the bars
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

# Wide page (24 x 6 in) so that all intersections fit side by side.
ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 19817
length(unique(kept$to_geneID))
## [1] 20636
range(kept$from_count)
## [1]   1 270
range(kept$to_count)
## [1]   1 129
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 78
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 279
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

8.13 PSS kept/rejected

# Annotate the PSS node table with the orthologue pairs and their filter
# outcome (kept criteria or "reject").
# Column selection by position with `[, j]` assumes pss_long is a plain
# data.frame here - TODO confirm; on a data.table this form would not select
# columns.
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
# Left join on the Arabidopsis gene id: every PSS entry is retained, also those
# without any orthologue pair. The columns are picked from `df` itself (the
# previous code looked the names up in `dt`, which only worked because both
# names referred to the same table).
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(df), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
# Keep only entries whose id starts with "AT", then drop duplicate rows
# created by the join.
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH             MCScanX 
##                 114                 110                  23                 776 
## OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH         RBH_MapMan4 
##                 124                 129                  56                  29 
##              reject 
##                1540
# Export the annotated PSS table, kept and rejected pairs together.
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
# Parameters for one run of the child document (here: pear / 'pcox').
params_list <- list(
  # Species keys used to select the rows of each orthology source.
  plantName1 = 'pcox',     # change name - PLAZA, OrthoDB, RBH
  plantName2 = 'pcox',     # change name - compara # sources
  plantName3 = '^pear$',   # change name - MCScanX # sources
  plantName4 = 'pcox',     # change name - FastOMA # sources

  # Output naming.
  plantNameOut = "pear",
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "pear"),

  # Protein/transcript ID -> gene ID: remove everything from the last dot on.
  pattern_in = "(\\.[^.]+)$",   # all IDs: last dot and everything after it
  pattern_out = "",             # all IDs

  # Mercator (plant-gmm) input file and ID clean-up patterns.
  mercator = 'pcox_Mercator4v7_results.txt',           # plant-gmm
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']",  # plant-gmm, generic removal of stray quote characters
  mercatorPatternOut1 = "",                            # plant-gmm
  mercatorPatternIn2 = "(\\.chr\\d+)a(\\.)",           # plant-gmm
  mercatorPatternOut2 = "\\1A\\2",

  # Switches selecting which evidence sources / code paths are used.
  flag1 = 4,
  flag2 = 4,
  flag3 = FALSE
)

# note: in compara - geneID and prot ID are completely different

# Copy the parameters into a fresh environment; the child document is knitted
# inside it, so each list entry is visible there as a plain variable.
env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000274714b6270>

child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-180] | |….. | 9% | |…… | 12% [unnamed-chunk-181] | |…….. | 15% | |……… | 18% [unnamed-chunk-182] | |……….. | 21% | |………… | 24% [unnamed-chunk-183] | |………….. | 27% | |…………… | 30% [unnamed-chunk-184] | |…………….. | 33% | |……………… | 36% [unnamed-chunk-185] | |……………….. | 39% | |………………… | 42% [unnamed-chunk-186] | |………………….. | 45% | |…………………… | 48% [unnamed-chunk-187] | |…………………….. | 52% | |……………………… | 55% [unnamed-chunk-188] | |……………………….. | 58% | |………………………… | 61% [unnamed-chunk-189] | |………………………….. | 64% | |…………………………… | 67% [unnamed-chunk-190] | |…………………………….. | 70% | |……………………………… | 73% [unnamed-chunk-191] | |……………………………….. | 76% | |………………………………… | 79% [unnamed-chunk-192] | |………………………………….. | 82% | |…………………………………… | 85% [unnamed-chunk-193] | |…………………………………….. | 88% | |……………………………………… | 91% [unnamed-chunk-194] | |……………………………………….. | 94% | |………………………………………… | 97% [unnamed-chunk-195] | |…………………………………………..| 100%

cat(child_content)

9 Subsection: pcox

# Make sure the per-plant report directory exists before anything is written.
if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

9.1 Ortho sources

# Zipped orthology tables in ../intermediate, one file per evidence source.
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
# change names
fl = grep('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees',
          fl, value = TRUE)
fl = grep('\\.zip$', fl, value = TRUE)

# Read every selected orthology table, keep the rows of the current target
# species and bring them to one common five-column layout:
#   from_geneID, from_protID, to_geneID, to_protID, source
# Depending on the source, either the gene IDs or the protein IDs are set to NA
# at this stage. The per-file tables are collected in a list and bound once at
# the end instead of growing `df` inside the loop.
std_cols = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')
parts = list()

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  # The dispatch below needs exactly one source label per file; an empty file
  # or a file mixing several sources is reported and skipped instead of
  # breaking the `if` conditions.
  if (length(us) != 1 || is.na(us)) {
    warning('Skipping ', i, ': expected exactly one value in column "source", found ',
            length(us), call. = FALSE)
    next
  }
  
  if (us == 'ensembl-compara') {
    
    dt = dt[dt$homology_species == plantName2, ]
    # columns picked by position; gene and protein IDs are both retained
    dt = dt[, c(1,2,6,7,10)]
    colnames(dt) = std_cols
    
  } else if (us == 'FastOMA') {
    
    dt = dt[dt$to_plant == plantName4, ]
    # reorder by position, then blank both gene ID columns (set to NA)
    dt = dt[, c(2,1, 4,3, 5)]
    colnames(dt) = std_cols
    dt[, 1] = NA
    dt[, 3] = NA
    
  } else if (us == 'MCScanX') {
    
    # plantName3 is used as a regular expression here #  change names
    dt = dt[grepl(plantName3, dt$to_plant), ]
    dt = dt[, c(2,1, 4,3, 6)]
    colnames(dt) = std_cols
    dt[, 1] = NA
    dt[, 3] = NA
    
  } else if (us %in% c('PLAZA', 'OrthoDB', 'RBH')) {
    
    # These three sources are handled identically apart from the name of the
    # species column; columns 2 and 4 are blanked (set to NA).
    plant_col = if (us == 'PLAZA') 'orthologous_species' else 'to_plant'
    dt = dt[dt[[plant_col]] == plantName1, ]
    colnames(dt) = std_cols
    dt[, 2] = NA
    dt[, 4] = NA
    
  } else {
    
    warning('Skipping ', i, ': unknown source "', us, '"', call. = FALSE)
    next
    
  }
  
  parts[[length(parts) + 1]] = dt
}

# NULL when nothing was loaded, as before.
df = if (length(parts) > 0) data.table::rbindlist(parts, use.names = TRUE) else NULL
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
table(df$source)
## 
## FastOMA MCScanX     RBH 
##   75988   61908   36090
# Preview: the first two and the last two rows of every source, ordered by
# source (despite its name the object holds two rows per end, not three).
first_last_three_per_source = dplyr::ungroup(
  dplyr::arrange(
    dplyr::bind_rows(
      dplyr::slice_head(dplyr::group_by(df, source), n = 2),
      dplyr::slice_tail(dplyr::group_by(df, source), n = 2)
    ),
    source
  )
)

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_protID to_geneID                   to_protID          source
##    <chr>       <chr>       <chr>                       <chr>              <chr> 
##  1 <NA>        AT1G20520.1 <NA>                        Pyrco.da.v2a1.aug… FastO…
##  2 <NA>        AT1G76210.1 <NA>                        Pyrco.da.v2a1.aug… FastO…
##  3 <NA>        AT5G53090.1 <NA>                        Pyrco.da.v2a1.sna… FastO…
##  4 <NA>        AT5G53100.1 <NA>                        Pyrco.da.v2a1.sna… FastO…
##  5 <NA>        AT1G20520.1 <NA>                        Pyrco.da.v2a1.aug… MCSca…
##  6 <NA>        AT1G76210.1 <NA>                        Pyrco.da.v2a1.aug… MCSca…
##  7 <NA>        AT3G17760.2 <NA>                        Pyrco.da.v2a1.sna… MCSca…
##  8 <NA>        AT3G17765.1 <NA>                        Pyrco.da.v2a1.sna… MCSca…
##  9 AT1G01030   <NA>        Pyrco.da.v2a1.chr14A.371380 <NA>               RBH   
## 10 AT1G01030   <NA>        Pyrco.da.v2a1.chr1A.345960  <NA>               RBH   
## 11 ATMG01250   <NA>        Pyrco.da.v2a1.chr5A.045340  <NA>               RBH   
## 12 ATMG01250   <NA>        Pyrco.da.v2a1.snap.153710   <NA>               RBH

9.2 Transcript (aka protein) to geneID

# Where the Arabidopsis gene ID is missing, derive it from the protein ID by
# removing a trailing ".<digits>" suffix (e.g. AT1G20520.1 -> AT1G20520).
ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])

# orfs!
# IDs that still contain a dot do not end in ".<digits>" (e.g. "...uORF1") and
# are therefore left unchanged; list them per source for inspection.
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)
## 
## MCScanX 
##      39
print(df[ind, ])
##         from_geneID     from_protID to_geneID                        to_protID
##              <char>          <char>    <char>                           <char>
##  1: AT3G51630.uORF1 AT3G51630.uORF1      <NA> Pyrco.da.v2a1.augustus.329290.t1
##  2: AT1G06150.uORF1 AT1G06150.uORF1      <NA>   Pyrco.da.v2a1.chr10A.089460.t1
##  3: AT2G31280.uORF1 AT2G31280.uORF1      <NA>   Pyrco.da.v2a1.chr10A.089460.t1
##  4: AT1G30270.uORF1 AT1G30270.uORF1      <NA>   Pyrco.da.v2a1.chr10A.095270.t1
##  5: AT1G29950.uORF1 AT1G29950.uORF1      <NA>   Pyrco.da.v2a1.chr10A.095790.t1
##  6: AT2G27230.uORF1 AT2G27230.uORF1      <NA>   Pyrco.da.v2a1.chr11A.116810.t1
##  7: AT3G12010.uORF1 AT3G12010.uORF1      <NA>   Pyrco.da.v2a1.chr12A.316850.t1
##  8: AT1G23150.uORF1 AT1G23150.uORF1      <NA>   Pyrco.da.v2a1.chr13A.237820.t1
##  9: AT1G70780.uORF1 AT1G70780.uORF1      <NA>   Pyrco.da.v2a1.chr13A.237820.t1
## 10: AT1G25470.uORF1 AT1G25470.uORF1      <NA>   Pyrco.da.v2a1.chr13A.239350.t1
## 11: AT1G68550.uORF1 AT1G68550.uORF1      <NA>   Pyrco.da.v2a1.chr13A.239350.t1
## 12: AT3G25570.uORF1 AT3G25570.uORF1      <NA>   Pyrco.da.v2a1.chr13A.240180.t1
## 13: AT3G12010.uORF1 AT3G12010.uORF1      <NA>   Pyrco.da.v2a1.chr14A.362550.t1
## 14: AT1G64140.uORF1 AT1G64140.uORF1      <NA>   Pyrco.da.v2a1.chr14A.369240.t1
## 15: AT4G36990.uORF1 AT4G36990.uORF1      <NA>   Pyrco.da.v2a1.chr15A.024510.t1
## 16: AT1G23150.uORF1 AT1G23150.uORF1      <NA>   Pyrco.da.v2a1.chr16A.185840.t1
## 17: AT1G70780.uORF1 AT1G70780.uORF1      <NA>   Pyrco.da.v2a1.chr16A.185840.t1
## 18: AT3G25570.uORF1 AT3G25570.uORF1      <NA>   Pyrco.da.v2a1.chr16A.188440.t1
## 19: AT4G19110.uORF1 AT4G19110.uORF1      <NA>   Pyrco.da.v2a1.chr16A.208550.t1
## 20: AT3G01470.uORF1 AT3G01470.uORF1      <NA>   Pyrco.da.v2a1.chr17A.289810.t1
## 21: AT3G02470.uORF1 AT3G02470.uORF1      <NA>   Pyrco.da.v2a1.chr17A.293310.t1
## 22: AT5G15950.uORF1 AT5G15950.uORF1      <NA>   Pyrco.da.v2a1.chr17A.293310.t1
## 23: AT4G25670.uORF1 AT4G25670.uORF1      <NA>    Pyrco.da.v2a1.chr1A.355460.t1
## 24: AT4G25690.uORF1 AT4G25690.uORF1      <NA>    Pyrco.da.v2a1.chr1A.355460.t1
## 25: AT5G52550.uORF1 AT5G52550.uORF1      <NA>    Pyrco.da.v2a1.chr1A.355460.t1
## 26: AT4G36990.uORF1 AT4G36990.uORF1      <NA>    Pyrco.da.v2a1.chr2A.143870.t1
## 27: AT5G60450.uORF1 AT5G60450.uORF1      <NA>    Pyrco.da.v2a1.chr3A.270500.t1
## 28: AT3G53400.uORF1 AT3G53400.uORF1      <NA>    Pyrco.da.v2a1.chr4A.411680.t1
## 29: AT5G03190.uORF1 AT5G03190.uORF1      <NA>    Pyrco.da.v2a1.chr4A.411680.t1
## 30: AT3G51630.uORF1 AT3G51630.uORF1      <NA>    Pyrco.da.v2a1.chr4A.417460.t1
## 31: AT1G06150.uORF1 AT1G06150.uORF1      <NA>    Pyrco.da.v2a1.chr5A.058630.t1
## 32: AT2G31280.uORF1 AT2G31280.uORF1      <NA>    Pyrco.da.v2a1.chr5A.058630.t1
## 33: AT4G19110.uORF1 AT4G19110.uORF1      <NA>    Pyrco.da.v2a1.chr6A.425040.t1
## 34: AT1G64140.uORF1 AT1G64140.uORF1      <NA>    Pyrco.da.v2a1.chr6A.433460.t1
## 35: AT4G25670.uORF1 AT4G25670.uORF1      <NA>    Pyrco.da.v2a1.chr7A.180790.t1
## 36: AT5G52550.uORF1 AT5G52550.uORF1      <NA>    Pyrco.da.v2a1.chr7A.180790.t1
## 37: AT3G01470.uORF1 AT3G01470.uORF1      <NA>    Pyrco.da.v2a1.chr9A.212340.t1
## 38: AT1G30270.uORF1 AT1G30270.uORF1      <NA>     Pyrco.da.v2a1.snap.064820.t1
## 39: AT1G48600.uORF1 AT1G48600.uORF1      <NA>     Pyrco.da.v2a1.snap.379480.t1
##         from_geneID     from_protID to_geneID                        to_protID
##      source
##      <char>
##  1: MCScanX
##  2: MCScanX
##  3: MCScanX
##  4: MCScanX
##  5: MCScanX
##  6: MCScanX
##  7: MCScanX
##  8: MCScanX
##  9: MCScanX
## 10: MCScanX
## 11: MCScanX
## 12: MCScanX
## 13: MCScanX
## 14: MCScanX
## 15: MCScanX
## 16: MCScanX
## 17: MCScanX
## 18: MCScanX
## 19: MCScanX
## 20: MCScanX
## 21: MCScanX
## 22: MCScanX
## 23: MCScanX
## 24: MCScanX
## 25: MCScanX
## 26: MCScanX
## 27: MCScanX
## 28: MCScanX
## 29: MCScanX
## 30: MCScanX
## 31: MCScanX
## 32: MCScanX
## 33: MCScanX
## 34: MCScanX
## 35: MCScanX
## 36: MCScanX
## 37: MCScanX
## 38: MCScanX
## 39: MCScanX
##      source
# Where the target gene ID is missing, derive it from the target protein ID
# using the run-specific pattern_in / pattern_out substitution.
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed



# Preview after the ID conversion: first two and last two rows of every
# source (the object name says "three", but n = 2 rows are taken per end).
df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_protID to_geneID                     to_protID        source
##    <chr>       <chr>       <chr>                         <chr>            <chr> 
##  1 AT1G20520   AT1G20520.1 Pyrco.da.v2a1.augustus.000230 Pyrco.da.v2a1.a… FastO…
##  2 AT1G76210   AT1G76210.1 Pyrco.da.v2a1.augustus.000230 Pyrco.da.v2a1.a… FastO…
##  3 AT5G53090   AT5G53090.1 Pyrco.da.v2a1.snap.445350     Pyrco.da.v2a1.s… FastO…
##  4 AT5G53100   AT5G53100.1 Pyrco.da.v2a1.snap.445350     Pyrco.da.v2a1.s… FastO…
##  5 AT1G20520   AT1G20520.1 Pyrco.da.v2a1.augustus.000230 Pyrco.da.v2a1.a… MCSca…
##  6 AT1G76210   AT1G76210.1 Pyrco.da.v2a1.augustus.000230 Pyrco.da.v2a1.a… MCSca…
##  7 AT3G17760   AT3G17760.2 Pyrco.da.v2a1.snap.445090     Pyrco.da.v2a1.s… MCSca…
##  8 AT3G17765   AT3G17765.1 Pyrco.da.v2a1.snap.445090     Pyrco.da.v2a1.s… MCSca…
##  9 AT1G01030   <NA>        Pyrco.da.v2a1.chr14A.371380   <NA>             RBH   
## 10 AT1G01030   <NA>        Pyrco.da.v2a1.chr1A.345960    <NA>             RBH   
## 11 ATMG01250   <NA>        Pyrco.da.v2a1.chr5A.045340    <NA>             RBH   
## 12 ATMG01250   <NA>        Pyrco.da.v2a1.snap.153710     <NA>             RBH
# Sanity check: number of missing target gene / protein IDs per source.
summary_na = df[, lapply(.SD, function(id_col) sum(is.na(id_col))),
                by = source, .SDcols = c("to_geneID", "to_protID")]
data.table::setnames(summary_na,
                     c("to_geneID", "to_protID"),
                     c("na_to_geneID", "na_to_protID"))
print(summary_na)
##     source na_to_geneID na_to_protID
##     <char>        <int>        <int>
## 1: FastOMA            0            0
## 2: MCScanX            0            0
## 3:     RBH            0        36090

9.3 PLAZA and ensembl-compara with Orthofinder

Here we have some losses, because gene IDs do not translate well between annotation versions!

# Translate the target IDs of the ensembl-compara and PLAZA pairs through
# OrthoFinder result tables (files matching 'Compara_' / 'PLAZA_' in the
# OrthoFinder input directory) - presumably to map between annotation
# versions; pairs without a translation are dropped. The step is skipped when
# flag1 or flag2 is 4; then only the ID and source columns are kept.
#
# plantDirIn, ref_genome, flag3 and the compara_/plaza_ pattern variables are
# not defined in this chunk and must be supplied with the run parameters.
#
# Each source is translated whenever its own OrthoFinder table is available.
# Previously the PLAZA translation was silently discarded when no Compara
# table was present.
if (flag1 != 4 && flag2 != 4) {

  fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
  
  fl = list.files(fp)
  fn = fl[grep('Compara_', fl)] # change filename
  if (length(fn) != 0) {
    compara = data.table::fread(file.path(fp, fn))
  } else {
    # empty placeholder so that the nrow() checks below skip this source
    compara = data.frame(matrix(ncol = 4, nrow = 0))
  }
  
  fn = fl[grep('PLAZA_', fl)] # change filename
  if (length(fn) != 0) {
    plaza = data.table::fread(file.path(fp, fn))
  } else {
    plaza = data.frame(matrix(ncol = 4, nrow = 0))
  }
  
  
  # keep only the rows of the reference genome
  compara = compara[compara$Species == ref_genome, ] # change name
  plaza = plaza[plaza$Species == ref_genome, ] # change name
  
  
  colnames(compara)[3] = colnames(plaza)[3] = 'source'
  
  if (nrow(compara) != 0) {
    # Both ID columns hold comma-separated lists; expand every row into all
    # (source ID, ortholog ID) combinations.
    compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    result = compara[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(pairs, c("OrthoDB_ID", "Ortholog"))
      pairs
    }, by = seq_len(nrow(compara))]
    compara = result[, seq_len := NULL]
    # compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1, 
                             sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
    compara = compara[!duplicated(compara), ]
  }
  
  
  if (nrow(plaza) != 0) {
    plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    result = plaza[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(pairs, c("OrthoDB_ID", "Ortholog"))
      pairs
    }, by = seq_len(nrow(plaza))]
    plaza = result[, seq_len := NULL]
    # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
    plaza = plaza[!duplicated(plaza), ]
  }
  
  # keep only the part after the last blank of each ortholog entry
  if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog) # improve if possible
  
  # Translated pairs per source; NULL means "nothing to add".
  df_compara = NULL
  df_plaza = NULL
  
  if (nrow(compara) != 0) {
    if (flag2 == 1) { # geneID and prot ID are completely different # make flags
      # join on the gene ID
      df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog)  
    } else if (flag2 == 2) {
      # join on the protein ID
      df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog) 
    } else {
      df_compara = NULL
    }
    # drop pairs without a translation
    df_compara = df_compara[!is.na(df_compara$to_geneID), ]
  }
  
  
  
  if (nrow(plaza) != 0) {
    df_plaza = dplyr::filter(df, source == "PLAZA") %>%
      dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
      dplyr::mutate(to_geneID = Ortholog) %>%
      dplyr::select(-Ortholog)
    df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
  }
  
  # Sources for which an OrthoFinder table was available: their original rows
  # are replaced by the translated ones, all other rows are kept as they are.
  mapped_sources = c(if (nrow(compara) != 0) "ensembl-compara",
                     if (nrow(plaza) != 0) "PLAZA")
  
  if (length(mapped_sources) > 0) {
    df_other = dplyr::filter(df, !(source %in% mapped_sources))
    dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
  } else {
    dt = df
  }
  
  
  ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
  df = dt[, ..ind]
  df = df[!duplicated(df), ]
  
  
  # The translated IDs get the same suffix clean-up as the other target IDs.
  if (length(mapped_sources) > 0) {
    ind = which(df$source %in% mapped_sources)
    df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
  }
  
  
  
  
  
  # Preview: first two and last two rows of every source.
  df %>%
    dplyr::group_by(source) %>%
    dplyr::slice_head(n = 2) %>%
    dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
    dplyr::arrange(source) %>%
    dplyr::ungroup() -> first_last_three_per_source
  
  print(first_last_three_per_source, n = nrow(first_last_three_per_source))

} else {
  ind = c(grep("from_geneID|to_geneID|source", colnames(df)))
  df = df[, ..ind]
  
}

# Remove exact duplicate rows from the combined orthologue table.
is_dup = duplicated(df)
df = df[!is_dup, ]

# Objects that must survive the clean-up; every other object returned by
# ls() (including the helper vectors created just above) is removed.
keep_objects = c(
  "df",
  "ath.gmm", "gn", "sn", "pss_long",
  "plantName1",
  "plantNameOut",
  "plantDirOut",
  "pattern_in",
  "pattern_out",
  "mercator",
  "mercatorPatternIn1",
  "mercatorPatternOut1",
  "mercatorPatternIn2",
  "mercatorPatternOut2",
  "flag1", "flag2"
)
rm(list = setdiff(ls(), keep_objects))




gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  2339005 125.0    7508477 401.0  11731995  626.6
## Vcells 39244998 299.5   98908124 754.7 193111802 1473.4
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

9.4 To wide format

# Work on an independent copy: with a plain `dt = df` both names refer to the
# same data.table, so the `:=` below would also add `present` to `df`.
dt = data.table::copy(df)
length(unique(dt$from_geneID))
## [1] 21628
length(unique(dt$to_geneID))
## [1] 30838
table(dt$source)
## 
## FastOMA MCScanX     RBH 
##   75969   34022   36090
# Marker column; dcast() spreads it into one logical column per source.
dt[, present := TRUE]

# One row per gene pair, one TRUE/FALSE column per source (FALSE where the
# source does not report the pair).
dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

9.5 Upset plot

# Evidence columns for this run, selected by flag1:
#   1 -> all six sources
#   2 -> as 1 but without PLAZA
#   3 -> as 2 but without ensembl-compara
#   anything else -> MCScanX, RBH and FastOMA only
source_cols = c(
  "MCScanX",
  if (flag1 == 1 || flag1 == 2) "ensembl-compara",
  if (flag1 == 1) "PLAZA",
  if (flag1 == 1 || flag1 == 2 || flag1 == 3) 'OrthoDB',
  'RBH',
  "FastOMA"
)

# Number of sources that report each gene pair.
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))

# upset() is given a plain data.frame here.
dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  # 'Intersection ratio' = intersection_ratio() could be added to this list
  base_annotations = list('Intersection size' = intersection_size(counts = FALSE)),
  # Order intersections by degree (number of sets involved), then by
  # cardinality (intersection size) within each degree, both descending.
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending"
)
upset_plot = upset_plot +
  ggtitle("Overlap of gene pairs supported by multiple methods")

# Show the plot, then write it to the species report folder.
print(upset_plot)

ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"),
  plot = upset_plot,
  device = "pdf",
  width = 24,
  height = 6
) # change name

9.6 Ath ORFs

  • Take care: the ath CDS FASTA (used for MCScanX) also contains uORF entries, e.g. besides AT1G30330.1, AT1G30330.2 and AT1G30330.3:
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
dt.wide[grep('ORF', dt.wide$from_geneID), ]
## Key: <from_geneID, to_geneID>
##         from_geneID                     to_geneID FastOMA MCScanX    RBH
##              <char>                        <char>  <lgcl>  <lgcl> <lgcl>
##  1: AT1G06150.uORF1   Pyrco.da.v2a1.chr10A.089460   FALSE    TRUE  FALSE
##  2: AT1G06150.uORF1    Pyrco.da.v2a1.chr5A.058630   FALSE    TRUE  FALSE
##  3: AT1G23150.uORF1   Pyrco.da.v2a1.chr13A.237820   FALSE    TRUE  FALSE
##  4: AT1G23150.uORF1   Pyrco.da.v2a1.chr16A.185840   FALSE    TRUE  FALSE
##  5: AT1G25470.uORF1   Pyrco.da.v2a1.chr13A.239350   FALSE    TRUE  FALSE
##  6: AT1G29950.uORF1   Pyrco.da.v2a1.chr10A.095790   FALSE    TRUE  FALSE
##  7: AT1G30270.uORF1   Pyrco.da.v2a1.chr10A.095270   FALSE    TRUE  FALSE
##  8: AT1G30270.uORF1     Pyrco.da.v2a1.snap.064820   FALSE    TRUE  FALSE
##  9: AT1G48600.uORF1     Pyrco.da.v2a1.snap.379480   FALSE    TRUE  FALSE
## 10: AT1G64140.uORF1   Pyrco.da.v2a1.chr14A.369240   FALSE    TRUE  FALSE
## 11: AT1G64140.uORF1    Pyrco.da.v2a1.chr6A.433460   FALSE    TRUE  FALSE
## 12: AT1G68550.uORF1   Pyrco.da.v2a1.chr13A.239350   FALSE    TRUE  FALSE
## 13: AT1G70780.uORF1   Pyrco.da.v2a1.chr13A.237820   FALSE    TRUE  FALSE
## 14: AT1G70780.uORF1   Pyrco.da.v2a1.chr16A.185840   FALSE    TRUE  FALSE
## 15: AT2G27230.uORF1   Pyrco.da.v2a1.chr11A.116810   FALSE    TRUE  FALSE
## 16: AT2G31280.uORF1   Pyrco.da.v2a1.chr10A.089460   FALSE    TRUE  FALSE
## 17: AT2G31280.uORF1    Pyrco.da.v2a1.chr5A.058630   FALSE    TRUE  FALSE
## 18: AT3G01470.uORF1   Pyrco.da.v2a1.chr17A.289810   FALSE    TRUE  FALSE
## 19: AT3G01470.uORF1    Pyrco.da.v2a1.chr9A.212340   FALSE    TRUE  FALSE
## 20: AT3G02470.uORF1   Pyrco.da.v2a1.chr17A.293310   FALSE    TRUE  FALSE
## 21: AT3G12010.uORF1   Pyrco.da.v2a1.chr12A.316850   FALSE    TRUE  FALSE
## 22: AT3G12010.uORF1   Pyrco.da.v2a1.chr14A.362550   FALSE    TRUE  FALSE
## 23: AT3G25570.uORF1   Pyrco.da.v2a1.chr13A.240180   FALSE    TRUE  FALSE
## 24: AT3G25570.uORF1   Pyrco.da.v2a1.chr16A.188440   FALSE    TRUE  FALSE
## 25: AT3G51630.uORF1 Pyrco.da.v2a1.augustus.329290   FALSE    TRUE  FALSE
## 26: AT3G51630.uORF1    Pyrco.da.v2a1.chr4A.417460   FALSE    TRUE  FALSE
## 27: AT3G53400.uORF1    Pyrco.da.v2a1.chr4A.411680   FALSE    TRUE  FALSE
## 28: AT4G19110.uORF1   Pyrco.da.v2a1.chr16A.208550   FALSE    TRUE  FALSE
## 29: AT4G19110.uORF1    Pyrco.da.v2a1.chr6A.425040   FALSE    TRUE  FALSE
## 30: AT4G25670.uORF1    Pyrco.da.v2a1.chr1A.355460   FALSE    TRUE  FALSE
## 31: AT4G25670.uORF1    Pyrco.da.v2a1.chr7A.180790   FALSE    TRUE  FALSE
## 32: AT4G25690.uORF1    Pyrco.da.v2a1.chr1A.355460   FALSE    TRUE  FALSE
## 33: AT4G36990.uORF1   Pyrco.da.v2a1.chr15A.024510   FALSE    TRUE  FALSE
## 34: AT4G36990.uORF1    Pyrco.da.v2a1.chr2A.143870   FALSE    TRUE  FALSE
## 35: AT5G03190.uORF1    Pyrco.da.v2a1.chr4A.411680   FALSE    TRUE  FALSE
## 36: AT5G15950.uORF1   Pyrco.da.v2a1.chr17A.293310   FALSE    TRUE  FALSE
## 37: AT5G52550.uORF1    Pyrco.da.v2a1.chr1A.355460   FALSE    TRUE  FALSE
## 38: AT5G52550.uORF1    Pyrco.da.v2a1.chr7A.180790   FALSE    TRUE  FALSE
## 39: AT5G60450.uORF1    Pyrco.da.v2a1.chr3A.270500   FALSE    TRUE  FALSE
##         from_geneID                     to_geneID FastOMA MCScanX    RBH
##     count_evidence
##              <num>
##  1:              1
##  2:              1
##  3:              1
##  4:              1
##  5:              1
##  6:              1
##  7:              1
##  8:              1
##  9:              1
## 10:              1
## 11:              1
## 12:              1
## 13:              1
## 14:              1
## 15:              1
## 16:              1
## 17:              1
## 18:              1
## 19:              1
## 20:              1
## 21:              1
## 22:              1
## 23:              1
## 24:              1
## 25:              1
## 26:              1
## 27:              1
## 28:              1
## 29:              1
## 30:              1
## 31:              1
## 32:              1
## 33:              1
## 34:              1
## 35:              1
## 36:              1
## 37:              1
## 38:              1
## 39:              1
##     count_evidence
# Keep only the rows whose Arabidopsis ID does not contain 'ORF'.
dt.wide = dt.wide[!grepl('ORF', dt.wide$from_geneID), ]

9.7 Gene occurrence

# Count occurrences: in how many rows (gene pairs) each Arabidopsis gene and
# each target gene takes part.
from_counts = dt.wide[, .(from_count = .N), by = from_geneID]
to_counts = dt.wide[, .(to_count = .N), by = to_geneID]

# Attach both tallies to every pair.
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

# Column order: IDs and per-source flags first, then from_count, to_count
# and count_evidence.
##### take care here
wide_names = colnames(dt.wide)
ind = c(
  grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', wide_names),
  grep('from_count', wide_names),
  grep('to_count', wide_names),
  grep('count_evidence', wide_names)
)
dt.wide = dt.wide[, ..ind]

9.8 In/out PSS

# Annotate every gene pair from the Arabidopsis side: Mercator bins, gene
# name, synonyms and PSS membership. All joins keep every row of the left table.
df = dt.wide %>%
  merge(ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE) %>%
  merge(gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) %>%
  merge(sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) %>%
  merge(pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

# PSS entries with an 'AT...' ID that have no row in the orthologue table,
# annotated the same way.
nin = pss_long[!(pss_long$id %in% df$from_geneID), ]
nin = nin[grepl('^AT', nin$id), ]
nin = nin %>%
  merge(ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE) %>%
  merge(gn, by.x = 'id', by.y = 'V1', all.x = TRUE) %>%
  merge(sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(
  nin,
  paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
  asTable = TRUE
) # change name

9.9 fruitTrees plant gmm

# Read the Mercator result table configured for this species.
fp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)

# Drop rows whose identifier is just a pair of apostrophes.
gmm = gmm[IDENTIFIER != "''"]

# One row per identifier; the distinct values of each annotation column are
# joined with " | ".
combined = gmm[,
  lapply(.SD, function(values) paste(unique(values), collapse = " | ")),
  by = IDENTIFIER,
  .SDcols = c("BINCODE", "NAME", "DESCRIPTION")
]

charToRaw(combined$IDENTIFIER[1])
##  [1] 27 70 79 72 63 6f 2e 64 61 2e 76 32 61 31 2e 63 68 72 33 61 2e 32 38 30 35
## [26] 32 30 2e 74 31 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
# Replace every match of mercatorPatternIn1 (gsub: all occurrences, not just the first).
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
charToRaw(combined$IDENTIFIER[1])
##  [1] 70 79 72 63 6f 2e 64 61 2e 76 32 61 31 2e 63 68 72 33 61 2e 32 38 30 35 32
## [26] 30 2e 74 31
# Upper-case the first character of each identifier; the rest is left as is.
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2))  # change as needed
# Second, run-specific rewrite of the identifier (pattern and replacement come from the run parameters).
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
# Shared ID pattern; sub() replaces only the first match per identifier.
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
# How many cleaned Mercator identifiers occur among the orthologue targets.
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
## 13984 30855
# NOTE(review): sub() removes only the first apostrophe of each string, so
# values joined from several rows with " | " may keep later apostrophes.
# The ath annotation earlier in this file is cleaned with the same sub()
# calls; if this is switched to gsub(), change both places together so the
# BINCODE strings stay comparable.
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)

# Prefix the annotation columns in positions 2-4 with 'fruitTrees_'.
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "RBH"             "from_count"      "to_count"        "count_evidence" 
##  [9] "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION" "athName"        
## [13] "athSynonims"     "all_pathways"    "short_name"
# Add the plant-side Mercator annotation to every pair (all rows of df are kept).
dt = merge(x = df, y = combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE 
## 95920
dt$to_geneID[is.na(dt$fruitTrees_BINCODE)] # check ones with strange ID
## character(0)
# Restore the column order df had before the merge and append the newly
# added annotation columns at the end.
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), dt_cols)
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]

# Objects that must survive the clean-up; every other object returned by
# ls() is removed.
keep_objects = c(
  "df",
  "ath.gmm", "gn", "sn", "pss_long",
  "plantName1",
  "plantNameOut",
  "plantDirOut",
  "pattern_in",
  "pattern_out",
  "mercator",
  "mercatorPatternIn1",
  "mercatorPatternOut1",
  "mercatorPatternIn2",
  "mercatorPatternOut2",
  "flag1", "flag2"
)
rm(list = setdiff(ls(), keep_objects))


gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  2212831 118.2    7508477 401.0  11731995  626.6
## Vcells 27621262 210.8   79126500 603.7 193111802 1473.4
library(magrittr)
library(ggplot2)
library(ComplexUpset)

9.10 Translation table

MapMan Mercator matches: first three levels only

# Drop exact duplicate rows.
df = unique(df)


# Compare two MapMan/Mercator BINCODE strings at the first three bin levels.
#
# Each argument is a single string that may hold several bin codes separated
# by "|" or ";". Codes are stripped of apostrophes, trimmed, cut to at most
# three dot-separated levels and de-duplicated before the two sets are
# compared.
#
# Arguments:
#   athMercator     length-one BINCODE string of the Arabidopsis gene
#                   (may be NA or "")
#   plantXMercator  length-one BINCODE string of the other plant's gene
#                   (may be NA or "")
#
# Returns a single string:
#   "100% match based on <codes>"     both code sets are identical
#   "partial match based on <codes>"  the sets share at least one code
#   "no match"                        nothing in common, or at least one side
#                                     has no usable code
compare_bin <- function(athMercator, plantXMercator) {
  # Turn one BINCODE string into the set of its distinct three-level codes.
  split_tokens = function(code) {
    if (is.na(code) || code == "") return(character(0))
    tokens = unlist(strsplit(as.character(code), "[|;]"))
    # The annotation strings may still carry apostrophes (they are removed
    # upstream with sub(), i.e. first occurrence only); remove them all so
    # the same code compares equal wherever it sits in the list.
    tokens = trimws(gsub("'", "", tokens, fixed = TRUE))
    # Empty pieces (e.g. from a trailing separator) are not bin codes.
    tokens = tokens[nzchar(tokens)]
    if (length(tokens) == 0) return(character(0))
    truncated = vapply(
      strsplit(tokens, ".", fixed = TRUE),
      function(lv) paste(lv[seq_len(min(3L, length(lv)))], collapse = "."),
      character(1)
    )
    unique(truncated)
  }

  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)

  # Two genes without any bin code must not count as a match:
  # setequal() on two empty sets would be TRUE.
  if (length(bin_set) == 0 || length(v4_set) == 0) {
    return("no match")
  }

  # Identical sets. This also covers the case where several plant codes all
  # collapse onto the single Arabidopsis code after truncation.
  if (setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }

  # Partial match if any code is shared; report the shared codes.
  common_tokens = intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }

  "no match"
}



# Classify every gene pair by comparing its two BINCODE strings; evaluated
# row by row because compare_bin() takes one pair of strings per call.
df = dplyr::ungroup(
  dplyr::mutate(
    dplyr::rowwise(df),
    MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE) # change name
  )
)

9.11 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 21603
length(unique(df$to_geneID))
## [1] 30838
range(df$from_count)
## [1]   1 136
range(df$to_count)
## [1]   1 116
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 261
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 287
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
# Every pair starts as "reject"; the filter loop further down overwrites the
# label for the pairs it accepts.
dt = as.data.table(df)[, filter_criteria := "reject"]
covered_genes = character(0)

# Methods for this run, selected by flag1 (same rule as for the source
# columns of the UpSet plot).  # make flags
methods = c(
  "MCScanX",
  if (flag1 == 1 || flag1 == 2) "ensembl-compara",
  if (flag1 == 1) "PLAZA",
  if (flag1 == 1 || flag1 == 2 || flag1 == 3) 'OrthoDB',
  'RBH',
  "FastOMA"
)


match_categories = c("no match", "100% match based", "partial match")

# For every method, count the pairs it supports in each match category.
long_dt = data.table::rbindlist(lapply(methods, function(method) {
  supported = dt[[method]] == TRUE
  data.table::data.table(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(supported & dt$MapMan4_Match == "no match"),
      sum(supported & stringr::str_detect(dt$MapMan4_Match, "100% match based")),
      sum(supported & stringr::str_detect(dt$MapMan4_Match, "partial match"))
    )
  )
}), use.names = TRUE)

# Fix the stacking order of the categories.
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

# Stacked bars: one bar per method, one segment per match category.
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_col() +
  ggplot2::labs(
    title = "MapMan match types count per method",
    x = "Method",
    y = "Count",
    fill = "Match Type"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)


# Keep the evidence count and the match label, with the label cut back to its
# category. Everything from "based on" onwards is removed, which leaves a
# trailing space on "100% match " and "partial match ".
dtsub = dt[, grep("count_evidence|MapMan4_Match", names(dt), value = TRUE), with = FALSE]
dtsub[, MapMan4_Match := sub('based on.*', '', MapMan4_Match)]
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          76695          14597           4628
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       44178    13082           4245
##   2       17197     1215            245
##   3       15320      300            138
# Same cross-table in long form, with named columns.
tab = as.data.table(as.data.frame(table(
  count_evidence = dtsub$count_evidence,
  MapMan4_Match = dtsub$MapMan4_Match
)))

# Stacking order; the levels carry the trailing space described above.
tab$MapMan4_Match = factor(
  as.character(tab$MapMan4_Match),
  levels = c('no match', 'partial match ', '100% match ')
)

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_col() +
  labs(
    title = "Frequency of count_evidence by MapMan4_Match",
    x = "count_evidence",
    y = "Frequency",
    fill = "MapMan4_Match"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)




# OrthoDB is one of the "special" methods only when neither flag equals 4.
# The flags are set as single values in the run parameters, so the scalar,
# short-circuiting `&&` is the operator to use inside `if`.
if (flag1 != 4 && flag2 != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
# (one zero entry per special method, named "<method>_MapMan4").
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

# Assign a filter_criteria label to gene pairs, one method at a time and in
# the order given by `methods`. A pair that already has a label other than
# "reject" is never relabelled, so earlier methods take precedence.
for (method in methods) {

  # Candidate rows: still "reject", supported by this method, and neither ID
  # is in covered_genes yet.
  # NOTE(review): only to_geneID values are ever added to covered_genes below,
  # so the from_geneID test can only exclude a row if a from-ID also occurs
  # as a to-ID — confirm this is intended.
  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  # Placeholder for an extra per-method condition; it is all TRUE in both
  # branches, so it currently has no effect.
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      # Special methods are decided row by row. The candidate list was fixed
      # above, so genes added to covered_genes during this pass do not
      # exclude later rows of the same pass.
      for (i in candidates) {
        row = dt[i]
        # Which of the special methods support this pair.
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          # Supported by three special methods: fixed label.
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          # Supported by two: label is the sorted method names joined by "_".
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Supported by a single special method: accept only if the MapMan
          # comparison reported a (100% or partial) match.
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          # Label the row in place and remember its target gene as covered.
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      # Other methods: every candidate row is accepted at once and labelled
      # with the method name; their target genes become covered.
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
##     RBH_MapMan4 FastOMA_MapMan4 
##            4284            8269
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##            8269            5614           34000            4284           43753
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

# Write the complete labelled table (kept and rejected pairs) as
# tab-separated text and as an Excel table.
data.table::fwrite(
  x = df,
  file = paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
  sep = '\t'
)
openxlsx::write.xlsx(
  x = df,
  file = paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
  asTable = TRUE
)

9.12 Filtered

# Split the labelled table by filter outcome.
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Recompute the per-gene pair counts by reference, separately for the full
# table and for the kept subset (rejected keeps the counts it had when it
# was split off).
setDT(df)
df[, from_count := .N, by = from_geneID][, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID][, to_count := .N, by = to_geneID]





# Histograms of the per-gene pair counts before (df) and after (kept) the
# filter, in a 2 x 2 layout sharing the same x and y ranges.
par(mfrow = c(2,2))
xlim = c(0,100)
# Bin the four vectors without plotting, only to find the tallest bar and
# use it as the common upper y limit.
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
# Overall title (spelling of "after" corrected).
mtext("Before and after filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

# For every method, count the kept pairs it supports in each match category.
long_kept = data.table::rbindlist(lapply(methods, function(method) {
  supported = kept[[method]] == TRUE
  data.table::data.table(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(supported & kept$MapMan4_Match == "no match"),
      sum(supported & stringr::str_detect(kept$MapMan4_Match, "100% match based")),
      sum(supported & stringr::str_detect(kept$MapMan4_Match, "partial match"))
    )
  )
}), use.names = TRUE)

# Fix the stacking order of the categories.
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

# Stacked bars: one bar per method, one segment per match category.
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_col() +
  ggplot2::labs(
    title = "MapMan match types count per method (after filter)",
    x = "Method",
    y = "Count",
    fill = "Match Type"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)


# Evidence count and match category of the kept pairs. Cutting the label at
# "based on" leaves a trailing space on "100% match " and "partial match ".
keptsub = kept[, grep("count_evidence|MapMan4_Match", names(kept), value = TRUE), with = FALSE]
keptsub[, MapMan4_Match := sub('based on.*', '', MapMan4_Match)]
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          46868           4031           1268
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       16448     2607            934
##   2       15100     1124            196
##   3       15320      300            138
# Same cross-table in long form, with named columns.
tab = as.data.table(as.data.frame(table(
  count_evidence = keptsub$count_evidence,
  MapMan4_Match = keptsub$MapMan4_Match
)))

# Stacking order; the levels carry the trailing space described above.
tab$MapMan4_Match = factor(
  as.character(tab$MapMan4_Match),
  levels = c('no match', 'partial match ', '100% match ')
)

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_col() +
  labs(
    title = "Frequency of count_evidence by MapMan4_Match (after filter)",
    x = "count_evidence",
    y = "Frequency",
    fill = "MapMan4_Match"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)


# Kept pairs cross-tabulated by evidence count, filter criterion and match
# category; drawn as stacked bars with one facet per evidence count.
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# Cut the label back to its category; the leading space in the pattern means
# no trailing space is left behind here.
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Empty combinations are not plotted.
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
# Order of the bars on the x axis.
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
# Stacking order of the match categories (set once; the earlier duplicate
# conversion with the same levels was redundant).
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    # border around each facet; `linewidth` replaces the deprecated `size`
    # argument for line widths (ggplot2 >= 3.4.0)
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
  )

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Export the rejected pairs.
openxlsx::write.xlsx(
  x = rejected,
  file = paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
  asTable = TRUE
)


# Build an undirected graph from the distinct kept pairs and look up the
# connected component each gene belongs to.
edges = unique(kept[, c("from_geneID", "to_geneID"), with = FALSE])
g = igraph::graph_from_data_frame(d = edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# Attach the component ID via the Arabidopsis gene of each pair.
# The commented lines below are the variant for a directed graph, where the
# from- and to-side components would be looked up separately.
kept = merge(x = kept, y = membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


# Export the kept pairs together with their component ID.
openxlsx::write.xlsx(
  x = kept,
  file = paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
  asTable = TRUE
)


# Evidence columns for this run, selected by flag1:
#   1 -> all six sources
#   2 -> as 1 but without PLAZA
#   3 -> as 2 but without ensembl-compara
#   anything else -> MCScanX, RBH and FastOMA only
# make flags
source_cols = c(
  "MCScanX",
  if (flag1 == 1 || flag1 == 2) "ensembl-compara",
  if (flag1 == 1) "PLAZA",
  if (flag1 == 1 || flag1 == 2 || flag1 == 3) 'OrthoDB',
  'RBH',
  "FastOMA"
)





# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  # 'Intersection ratio' = intersection_ratio() could be added to this list
  base_annotations = list('Intersection size' = intersection_size(counts = FALSE)),
  # Order intersections by degree (number of sets involved), then by
  # cardinality (intersection size) within each degree, both descending.
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending"
)
upset_plot = upset_plot +
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Show the plot, then write it to the species report folder.
print(upset_plot)

ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
  plot = upset_plot,
  device = "pdf",
  width = 24,
  height = 6
)



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 19961
length(unique(kept$to_geneID))
## [1] 29935
range(kept$from_count)
## [1]  1 44
range(kept$to_count)
## [1]  1 96
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 11
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 37
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

9.13 PSS kept/rejected

# Reduce the PSS table to its ID, pathway and short-name columns (matched by
# the end of the column name) and drop duplicate rows.
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
# Attach the orthologue, annotation and filter outcome of every pair to its
# PSS entry; PSS entries without any pair are kept. The columns are selected
# from the names of `df` itself, the table that is being subset.
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(df), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
# Only entries whose ID starts with 'AT' are reported.
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##             309             175            1459             118            1243
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
# Run parameters for one species; copied into the environment in which the
# child document is knitted.
params_list <- list(
  plantName1 = 'pcer',          # change name - PLAZA, OrthoDB, RBH
  plantName2 = 'pcer',          # change name - compara # sources
  plantName3 = '^cherryplum$',  # change name - MCScanX # sources
  plantName4 = 'pcer',          # change name - FastOMA # sources

  plantNameOut = "cherryplum",
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "cherryplum"),

  pattern_in = "",   # everything after the last dot
  pattern_out = "",  # all-IDs

  mercator = 'pcer_Mercator4v7_results.txt',           # plant-gmm
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']",  # plant-gmm, generic removal of nonsense
  mercatorPatternOut1 = "",                            # plant-gmm
  mercatorPatternIn2 = "-(ra)",                        # plant-gmm
  mercatorPatternOut2 = "-\\U\\1",

  flag1 = 4,
  flag2 = 4,
  flag3 = FALSE
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000002741a9160e8>

child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-214] | |….. | 9% | |…… | 12% [unnamed-chunk-215] | |…….. | 15% | |……… | 18% [unnamed-chunk-216] | |……….. | 21% | |………… | 24% [unnamed-chunk-217] | |………….. | 27% | |…………… | 30% [unnamed-chunk-218] | |…………….. | 33% | |……………… | 36% [unnamed-chunk-219] | |……………….. | 39% | |………………… | 42% [unnamed-chunk-220] | |………………….. | 45% | |…………………… | 48% [unnamed-chunk-221] | |…………………….. | 52% | |……………………… | 55% [unnamed-chunk-222] | |……………………….. | 58% | |………………………… | 61% [unnamed-chunk-223] | |………………………….. | 64% | |…………………………… | 67% [unnamed-chunk-224] | |…………………………….. | 70% | |……………………………… | 73% [unnamed-chunk-225] | |……………………………….. | 76% | |………………………………… | 79% [unnamed-chunk-226] | |………………………………….. | 82% | |…………………………………… | 85% [unnamed-chunk-227] | |…………………………………….. | 88% | |……………………………………… | 91% [unnamed-chunk-228] | |……………………………………….. | 94% | |………………………………………… | 97% [unnamed-chunk-229] | |…………………………………………..| 100%

cat(child_content)

10 Subsection: pcer

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

10.1 Ortho sources

# Collect the Arabidopsis -> target-plant ortholog pairs from every zipped
# source table in ../intermediate and stack them into one long table `df`
# with the columns from_geneID, from_protID, to_geneID, to_protID, source.
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees', fl)] # change names
fl = fl[grep('\\.zip$', fl)]

pair_cols = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')

# One recipe per source:
#   keep  - logical row filter selecting the target plant
#   cols  - column positions to keep, in output order (NULL = keep all columns)
#   blank - output columns (by position) that are overwritten with NA
source_specs = list(
  'ensembl-compara' = list(keep = function(d) d$homology_species == plantName2,
                           cols = c(1, 2, 6, 7, 10), blank = integer(0)),
  'FastOMA' = list(keep = function(d) d$to_plant == plantName4,
                   cols = c(2, 1, 4, 3, 5), blank = c(1, 3)),
  # plantName3 is a regular expression  # change names
  'MCScanX' = list(keep = function(d) grepl(plantName3, d$to_plant),
                   cols = c(2, 1, 4, 3, 6), blank = c(1, 3)),
  'PLAZA' = list(keep = function(d) d$orthologous_species == plantName1,
                 cols = NULL, blank = c(2, 4)),
  'OrthoDB' = list(keep = function(d) d$to_plant == plantName1,
                   cols = NULL, blank = c(2, 4)),
  'RBH' = list(keep = function(d) d$to_plant == plantName1,
               cols = NULL, blank = c(2, 4))
)

# Collect the per-file tables in a list and bind once at the end instead of
# growing `df` with rbind() inside the loop.
parts = vector('list', length(fl))

for (k in seq_along(fl)) {

  i = fl[k]
  print(i)

  dt = data.table::fread(i)
  us = unique(dt$source)

  # `if`/`switch` need exactly one source label per file; fail with a clear
  # message instead of the opaque "condition has length > 1" error.
  if (length(us) != 1 || is.na(us)) {
    stop('Expected exactly one value in the `source` column of ', i,
         ' but found: ', paste(us, collapse = ', '), call. = FALSE)
  }

  spec = source_specs[[us]]
  if (is.null(spec)) {
    # Unknown sources are skipped, as before, but reported as a warning.
    warning('ERROR: Unknown source "', us, '" in ', i, call. = FALSE)
    next
  }

  keep_rows = spec$keep(dt)
  dt = dt[keep_rows, ]
  if (!is.null(spec$cols)) dt = dt[, spec$cols, with = FALSE]
  colnames(dt) = pair_cols
  for (j in spec$blank) dt[, j] = NA

  parts[[k]] = dt
}

df = data.table::rbindlist(parts, use.names = TRUE)
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
table(df$source)
## 
## FastOMA MCScanX     RBH 
##  162100  114733   80487
# Preview: the first two and the last two pairs of every source, ordered by
# source (the object name is historical - two rows are taken from each end).
pairs_by_source = dplyr::group_by(df, source)
first_last_three_per_source = dplyr::bind_rows(
  dplyr::slice_head(pairs_by_source, n = 2),
  dplyr::slice_tail(pairs_by_source, n = 2)
)
first_last_three_per_source = dplyr::ungroup(
  dplyr::arrange(first_last_three_per_source, source)
)

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_protID to_geneID      to_protID      source 
##    <chr>       <chr>       <chr>          <chr>          <chr>  
##  1 <NA>        AT2G32760.1 <NA>           Pcer_000001-RA FastOMA
##  2 <NA>        AT1G07920.1 <NA>           Pcer_000002-RA FastOMA
##  3 <NA>        AT1G16650.1 <NA>           Pcer_097557-RA FastOMA
##  4 <NA>        AT1G53530.1 <NA>           Pcer_097558-RA FastOMA
##  5 <NA>        AT3G22380.1 <NA>           Pcer_000009-RA MCScanX
##  6 <NA>        AT3G22380.2 <NA>           Pcer_000009-RA MCScanX
##  7 <NA>        AT1G53530.1 <NA>           Pcer_097558-RA MCScanX
##  8 <NA>        AT1G53530.2 <NA>           Pcer_097558-RA MCScanX
##  9 AT1G01030   <NA>        Pcer_027461-RA <NA>           RBH    
## 10 AT1G01030   <NA>        Pcer_038773-RA <NA>           RBH    
## 11 ATMG01330   <NA>        Pcer_091451-RA <NA>           RBH    
## 12 ATMG01360   <NA>        Pcer_096779-RA <NA>           RBH

10.2 Transcript (aka protein) to geneID

ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])

# orfs!
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)
## 
## MCScanX 
##       8
print(df[ind, ])
##        from_geneID     from_protID to_geneID      to_protID  source
##             <char>          <char>    <char>         <char>  <char>
## 1: AT3G25570.uORF1 AT3G25570.uORF1      <NA> Pcer_008015-RA MCScanX
## 2: AT3G25570.uORF1 AT3G25570.uORF1      <NA> Pcer_013327-RA MCScanX
## 3: AT1G68550.uORF1 AT1G68550.uORF1      <NA> Pcer_013420-RA MCScanX
## 4: AT2G27230.uORF1 AT2G27230.uORF1      <NA> Pcer_017258-RA MCScanX
## 5: AT3G53400.uORF1 AT3G53400.uORF1      <NA> Pcer_018173-RA MCScanX
## 6: AT5G03190.uORF1 AT5G03190.uORF1      <NA> Pcer_018173-RA MCScanX
## 7: AT3G02470.uORF1 AT3G02470.uORF1      <NA> Pcer_094386-RA MCScanX
## 8: AT5G15950.uORF1 AT5G15950.uORF1      <NA> Pcer_094386-RA MCScanX
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed



df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_protID to_geneID      to_protID      source 
##    <chr>       <chr>       <chr>          <chr>          <chr>  
##  1 AT2G32760   AT2G32760.1 Pcer_000001-RA Pcer_000001-RA FastOMA
##  2 AT1G07920   AT1G07920.1 Pcer_000002-RA Pcer_000002-RA FastOMA
##  3 AT1G16650   AT1G16650.1 Pcer_097557-RA Pcer_097557-RA FastOMA
##  4 AT1G53530   AT1G53530.1 Pcer_097558-RA Pcer_097558-RA FastOMA
##  5 AT3G22380   AT3G22380.1 Pcer_000009-RA Pcer_000009-RA MCScanX
##  6 AT3G22380   AT3G22380.2 Pcer_000009-RA Pcer_000009-RA MCScanX
##  7 AT1G53530   AT1G53530.1 Pcer_097558-RA Pcer_097558-RA MCScanX
##  8 AT1G53530   AT1G53530.2 Pcer_097558-RA Pcer_097558-RA MCScanX
##  9 AT1G01030   <NA>        Pcer_027461-RA <NA>           RBH    
## 10 AT1G01030   <NA>        Pcer_038773-RA <NA>           RBH    
## 11 ATMG01330   <NA>        Pcer_091451-RA <NA>           RBH    
## 12 ATMG01360   <NA>        Pcer_096779-RA <NA>           RBH
summary_na = df[, .(
  na_to_geneID = sum(is.na(to_geneID)),
  na_to_protID = sum(is.na(to_protID))
), by = source]
print(summary_na)
##     source na_to_geneID na_to_protID
##     <char>        <int>        <int>
## 1: FastOMA            0            0
## 2: MCScanX            0            0
## 3:     RBH            0        80487

10.3 PLAZA and ensembl-compara with Orthofinder

Here we have some losses, because gene IDs do not always translate well between annotation versions!

# OrthoFinder-based ID translation for the PLAZA and ensembl-compara pairs.
# Runs only when neither flag1 nor flag2 equals 4; otherwise df is only reduced
# to its from_geneID / to_geneID / source columns (else branch at the bottom).
# NOTE(review): plantDirIn, ref_genome, compara_pattern_in1/out1/in2/out2 and
# plaza_pattern_in1/in2 are not among the entries visible in params_list for
# this run - confirm they are supplied whenever this branch is enabled.
if (flag1 != 4 & flag2 != 4) {

  fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
  
  # Read the OrthoFinder tables if present; otherwise use empty 4-column
  # placeholders so the nrow() checks below skip the corresponding steps.
  fl = list.files(fp)
  fn = fl[grep('Compara_', fl)] # change filename
  if (length(fn) != 0) {
    compara = data.table::fread(file.path(fp, fn))
  } else {
    compara = data.frame(matrix(ncol = 4, nrow = 0))
  }
  
  fn = fl[grep('PLAZA_', fl)] # change filename
  if (length(fn) != 0) {
    plaza = data.table::fread(file.path(fp, fn))
  } else {
    plaza = data.frame(matrix(ncol = 4, nrow = 0))
  }
  
  
  # Keep only the rows of the reference genome.
  compara = compara[compara$Species == ref_genome, ] # change name
  plaza = plaza[plaza$Species == ref_genome, ] # change name
  
  
  # The third column holds the comma-separated IDs that are split below.
  colnames(compara)[3] = colnames(plaza)[3] = 'source'
  
  # Expand every compara row into all (source ID, ortholog) pairs.
  if (nrow(compara) != 0) {
    compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    result = compara[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(pairs, c("OrthoDB_ID", "Ortholog"))
      pairs
    }, by = seq_len(nrow(compara))]
    compara = result[, seq_len := NULL]
    # compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1, 
                             sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
    compara = compara[!duplicated(compara), ]
    head(compara)
  }
  
  
  # Same expansion for the PLAZA table.
  if (nrow(plaza) != 0) {
    plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    result = plaza[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(pairs, c("OrthoDB_ID", "Ortholog"))
      pairs
    }, by = seq_len(nrow(plaza))]
    plaza = result[, seq_len := NULL]
    # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
    plaza = plaza[!duplicated(plaza), ]
    head(plaza)  
  }
  
  # Optional clean-up: keep only the text after the last space of each ortholog name.
  if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog) # improve if possible
  
  # Translate the ensembl-compara pairs. flag2 selects the join key:
  # 1 = to_geneID, 2 = to_protID; any other value leaves df_compara NULL.
  if (nrow(compara) != 0) {
    if (flag2 == 1) { # geneID and prot ID are completely different # make flags
      df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog)  
    } else if (flag2 == 2) {
        df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog) 
    } else {
      df_compara = NULL
    }
    # Pairs without a translated ID are dropped.
    df_compara = df_compara[!is.na(df_compara$to_geneID), ]
  }
  
  
  
  # Translate the PLAZA pairs (always joined on to_geneID).
  if (nrow(plaza) != 0) {
    df_plaza = dplyr::filter(df, source == "PLAZA") %>%
      dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
      dplyr::mutate(to_geneID = Ortholog) %>%
      dplyr::select(-Ortholog)
    df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
  }
  
  # Re-assemble the translated subsets with the untouched sources.
  # NOTE(review): when compara is empty but plaza is not, this falls through to
  # dt = df, so the df_plaza built above is never bound in - confirm whether
  # PLAZA-only input should be handled here.
  if (nrow(compara) != 0) {
    if (nrow(plaza) != 0) {
      df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))  
      dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
    } else {
      df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
      dt = dplyr::bind_rows(df_compara, df_other)
    }
  } else {
    dt = df
  }
  
  
  # Keep the gene-level columns only and drop duplicated pairs.
  ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
  df = dt[, ..ind]
  df = df[!duplicated(df), ]
  
  
  # Apply the plant-specific ID substitution to the translated sources.
  if (nrow(compara) != 0) {
    if (nrow(plaza) != 0) {
      ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
      df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
    } else {
      ind = which(df$source %in% c('ensembl-compara'))
      df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
    }
  }
  
  
  
  
  
  # Preview: first two and last two rows per source.
  df %>%
    dplyr::group_by(source) %>%
    dplyr::slice_head(n = 2) %>%
    dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
    dplyr::arrange(source) %>%
    dplyr::ungroup() -> first_last_three_per_source
  
  print(first_last_three_per_source, n = nrow(first_last_three_per_source))

} else {
  # No translation requested: just reduce df to the gene-level columns.
  ind = c(grep("from_geneID|to_geneID|source", colnames(df)))
  df = df[, ..ind]
  
}

df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))




gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  2105508 112.5    6006782 320.8  11731995  626.6
## Vcells 30180394 230.3   63775074 486.6 193111802 1473.4
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

10.4 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22204
length(unique(dt$to_geneID))
## [1] 71437
table(dt$source)
## 
## FastOMA MCScanX     RBH 
##  162100   63364   80487
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

10.5 Upset plot

# Evidence columns expected in dt.wide; flag1 selects which sources take part.
source_cols = if (flag1 == 1) {
  c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {
  c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}

# count_evidence = number of sources that support each gene pair.
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

# Distribution of the evidence count over all pairs.
hist(
  dt.wide$count_evidence,
  main = paste0('# ath-', plantName1, ' evidence')
)

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

10.6 Ath ORFs

  • Take care: besides regular transcripts such as AT1G30330.1, AT1G30330.2 and AT1G30330.3, the ath CDS FASTA (used for MCScanX) also contains uORF entries, e.g.:
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
dt.wide[grep('ORF', dt.wide$from_geneID), ]
## Key: <from_geneID, to_geneID>
##        from_geneID      to_geneID FastOMA MCScanX    RBH count_evidence
##             <char>         <char>  <lgcl>  <lgcl> <lgcl>          <num>
## 1: AT1G68550.uORF1 Pcer_013420-RA   FALSE    TRUE  FALSE              1
## 2: AT2G27230.uORF1 Pcer_017258-RA   FALSE    TRUE  FALSE              1
## 3: AT3G02470.uORF1 Pcer_094386-RA   FALSE    TRUE  FALSE              1
## 4: AT3G25570.uORF1 Pcer_008015-RA   FALSE    TRUE  FALSE              1
## 5: AT3G25570.uORF1 Pcer_013327-RA   FALSE    TRUE  FALSE              1
## 6: AT3G53400.uORF1 Pcer_018173-RA   FALSE    TRUE  FALSE              1
## 7: AT5G03190.uORF1 Pcer_018173-RA   FALSE    TRUE  FALSE              1
## 8: AT5G15950.uORF1 Pcer_094386-RA   FALSE    TRUE  FALSE              1
dt.wide = dt.wide[grep('ORF', dt.wide$from_geneID, invert = TRUE), ]

10.7 Gene occurrence

# counting occurrences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

10.8 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

10.9 fruitTrees plant gmm

fp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]

combined = gmm[, .(
  BINCODE = paste(unique(BINCODE), collapse = " | "),
  NAME = paste(unique(NAME), collapse = " | "),
  DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]

charToRaw(combined$IDENTIFIER[1])
##  [1] 27 70 63 65 72 5f 30 32 33 36 33 35 2d 72 61 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
charToRaw(combined$IDENTIFIER[1])
##  [1] 70 63 65 72 5f 30 32 33 36 33 35 2d 72 61
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2))  # change as needed
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
## 25951 71434
# Strip the quoting from the annotation columns.
# NOTE(review): sub() replaces only the first apostrophe in each string, so
# closing quotes (and quotes around later " | "-joined entries) presumably
# remain - confirm whether gsub(), as used for IDENTIFIER above, was intended.
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)

# Prefix the three annotation columns (positions 2:4) with "fruitTrees_".
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "RBH"             "from_count"      "to_count"        "count_evidence" 
##  [9] "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION" "athName"        
## [13] "athSynonims"     "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
##  FALSE   TRUE 
## 201291      3
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## [1] "Pcer_097367-RB" "Pcer_097392-RB" "Pcer_097544-RB"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName1", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag1", "flag2")))


gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  2810387 150.1    6006782 320.8  11731995  626.6
## Vcells 45079410 344.0   76618233 584.6 193111802 1473.4
library(magrittr)
library(ggplot2)
library(ComplexUpset)

10.10 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


#' Compare two Mercator (MapMan4) BINCODE strings at three-level resolution
#'
#' Each argument is a single string that may hold several bin codes separated
#' by "|" and/or ";". Every code is trimmed, stripped of single quotes and cut
#' to its first three dot-separated levels before the two sets are compared.
#'
#' Single quotes are removed because the Mercator fields are quoted and the
#' upstream clean-up uses sub(), which removes only the first apostrophe of a
#' string; leftover quotes would otherwise make equal bins look different.
#'
#' @param athMercator BINCODE string of the Arabidopsis gene (may be NA or "").
#' @param plantXMercator BINCODE string of the target-plant gene (may be NA or "").
#' @return One string: "100% match based on <bins>" when both sets are
#'   identical, "partial match based on <shared bins>" when they overlap, and
#'   "no match" otherwise, including when either side has no bin code at all.
compare_bin <- function(athMercator, plantXMercator) {
  # Turn one BINCODE string into its unique bins, truncated to three levels.
  split_tokens = function(code) {
    if (length(code) == 0L || is.na(code) || code == "") return(character(0))
    # Split on "|" and ";"; invert = TRUE returns the pieces between the
    # separators and keeps empty pieces.
    pieces = regmatches(code, gregexpr("[|;]", code), invert = TRUE)[[1]]
    tokens = unique(trimws(gsub("'", "", pieces, fixed = TRUE)))
    # Keep at most the first three dot-separated levels of every token.
    unique(sub("^((?:[^.]*\\.){2}[^.]*)\\..*$", "\\1", tokens, perl = TRUE))
  }

  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)

  # Two missing annotations are not evidence of a match: without this guard
  # setequal() on two empty sets would report "100% match based on ".
  if (length(bin_set) == 0L || length(v4_set) == 0L) {
    return("no match")
  }

  # Identical sets. This also covers a plant string that merely repeats the
  # single Arabidopsis bin several times, because tokens are de-duplicated.
  if (setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }

  # Partial match if any tokens overlap, mention those tokens
  common_tokens = intersect(bin_set, v4_set)
  if (length(common_tokens) > 0L) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }

  "no match"
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

10.11 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 22197
length(unique(df$to_geneID))
## [1] 71436
range(df$from_count)
## [1]   1 270
range(df$to_count)
## [1]   1 113
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 890
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 449
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


# Methods in priority order; flag1 selects which sources take part.
methods = if (flag1 == 1) {
  c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 2) {  # make flags
  c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag1 == 3) {
  c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  c("MCScanX", 'RBH', "FastOMA")
}

match_categories = c("no match", "100% match based", "partial match")

# For one method: how many of its supported pairs fall into each MapMan match type.
count_matches_all = function(method) {
  dt[, {
    supported = get(method) == TRUE
    list(
      Method = method,
      Match_Type = c("no match", "100% match based", "partial match"),
      Count = c(
        sum(supported & MapMan4_Match == "no match"),
        sum(supported & stringr::str_detect(MapMan4_Match, "100% match based")),
        sum(supported & stringr::str_detect(MapMan4_Match, "partial match"))
      )
    )
  }]
}

long_dt = data.table::rbindlist(lapply(methods, count_matches_all), use.names = TRUE)

# Fix the stacking order of the match types in the bar chart.
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##         166689          26214           8391
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       96970    22914           7476
##   2       39936     2642            641
##   3       29783      658            274
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




# Greedy filter: walk through the methods in priority order and label every
# still-unassigned ("reject") pair whose target and source genes have not been
# claimed by an earlier method.
#   * non-special methods: every candidate pair is kept and labelled with the
#     method name;
#   * special methods: a pair is kept when at least two special methods agree
#     (label = the agreeing methods), or when it is supported by this method
#     alone and MapMan4_Match reports a (partial or 100%) bin match
#     (label = "<method>_MapMan4").
# Only the to_geneID of a kept pair is added to covered_genes.
if (flag1 != 4 && flag2 != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  candidates = which(dt$filter_criteria == "reject" & dt[[method]] == TRUE &
                       !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes))

  if (length(candidates) == 0) next

  if (method %in% special_methods) {
    # The candidates are fixed before any of them is labelled and covered_genes
    # is not consulted again within one method, so the per-row decisions are
    # independent and can be taken for all candidates at once.
    hits = as.matrix(dt[candidates, special_methods, with = FALSE]) == TRUE
    n_hits = rowSums(hits)
    new_criteria = rep(NA_character_, length(candidates))

    # All three special methods agree.
    new_criteria[which(n_hits == 3)] = "OrthoDB_FastOMA_RBH"

    # Exactly two agree: label with the sorted pair of method names.
    two = which(n_hits == 2)
    if (length(two) > 0) {
      new_criteria[two] = vapply(two, function(k) {
        paste(sort(special_methods[hits[k, ]]), collapse = "_")
      }, character(1))
    }

    # Supported by this method only: require MapMan4 support.
    # reconsider
    # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
    #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
    single = which(n_hits == 1 &
                     grepl("match based on", dt$MapMan4_Match[candidates], ignore.case = TRUE))
    if (length(single) > 0) {
      mapman_label = paste0(method, "_MapMan4")
      new_criteria[single] = mapman_label
      mapman4_counts[[mapman_label]] = mapman4_counts[[mapman_label]] + length(single)
    }

    is_kept = !is.na(new_criteria)
    if (any(is_kept)) {
      accepted = candidates[is_kept]
      data.table::set(dt, i = accepted, j = "filter_criteria", value = new_criteria[is_kept])
      covered_genes = unique(c(covered_genes, dt$to_geneID[accepted]))
    }
  } else {
    dt[candidates, filter_criteria := method]
    covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
##     RBH_MapMan4 FastOMA_MapMan4 
##           11249           24512
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##           24512           18052           63356           11249           84125
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

10.12 Filtered

# Split the labelled pairs into rejected and kept ones.
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Recompute the per-gene pair counts by reference (no merge needed):
# on df over all pairs, and on kept over the retained pairs only.
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





# 2 x 2 panel comparing the per-gene pair counts before (df) and after (kept)
# the filter; all four panels share the same x and y limits.
old_par = par(mfrow = c(2, 2))
xlim = c(0, 100)

count_vectors = list(
  "df$from_count" = df$from_count,
  "kept$from_count" = kept$from_count,
  "df$to_count" = df$to_count,
  "kept$to_count" = kept$to_count
)

# Bin every vector once; the binned objects give the common y-axis limit and
# are then drawn directly instead of being recomputed by a second hist() call.
hists = lapply(count_vectors, hist, plot = FALSE, breaks = "Sturges")
max_count = max(unlist(lapply(hists, function(h) h$counts)))

for (panel in names(hists)) {
  plot(hists[[panel]],
       main = panel,
       xlab = sub("^.*\\$", "", panel),  # "from_count" / "to_count"
       xlim = xlim, ylim = c(0, max_count))
}

# Restore the layout that was active before instead of assuming c(1, 1).
par(old_par)
mtext("Before and after filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

# For every orthology method, count the kept pairs supported by that method
# in each MapMan4 match class; one 3-row table per method, stacked together.
long_kept = data.table::rbindlist(lapply(methods, function(method) {
  supported = kept[[method]] == TRUE
  match_txt = kept$MapMan4_Match
  data.table::data.table(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(supported & match_txt == "no match"),
      sum(supported & stringr::str_detect(match_txt, "100% match based")),
      sum(supported & stringr::str_detect(match_txt, "partial match"))
    )
  )
}), use.names = TRUE)

# Fix the stacking / legend order of the match classes.
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

# Stacked bars: one bar per method, segments coloured by match class.
ggplot2::ggplot(
  long_kept,
  ggplot2::aes(x = Method, y = Count, fill = Match_Type)
) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(
    title = "MapMan match types count per method (after filter)",
    x = "Method",
    y = "Count",
    fill = "Match Type"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

# Save the plot above as a 6 x 6 inch PDF in the species' report directory.
ggplot2::ggsave(
  paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
  device = "pdf", width = 6, height = 6, units = "in"
)


# Reduce `kept` to the evidence count and the MapMan4 match label, cut the
# label at "based on" (any blank preceding it is left in place), then tabulate.
keptsub = kept[, grep("count_evidence|MapMan4_Match", names(kept), value = TRUE), with = FALSE]
keptsub[, MapMan4_Match := sub('based on.*', '', MapMan4_Match)]
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##         108456           5577           3136
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       43023     2473           2302
##   2       35650     2446            560
##   3       29783      658            274
# Long-format frequency table: one row per (count_evidence, MapMan4_Match) cell.
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

# Normalise the labels before fixing the stacking order. The labels were cut
# with sub('based on.*', ...), which can leave a trailing blank; matching the
# factor levels against blank-padded strings is fragile (any mismatch silently
# becomes NA), so strip surrounding whitespace and use clean level names.
tab[, MapMan4_Match := factor(trimws(as.character(MapMan4_Match)),
                              levels = c('no match', 'partial match', '100% match'))]

# Stacked bars: one bar per evidence count, segments coloured by match class.
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Save the plot above as a 6 x 6 inch PDF in the species' report directory.
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Subset `kept` to the per-method flags, the evidence count, the MapMan4 match
# label and the filter criterion, and shorten the label to its leading class.
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)

# Three-way frequency table in long format; empty cells are dropped.
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab = tab[Freq > 0]

# Fix facet, x-axis and stacking order. Each factor is converted exactly once
# (the match label used to be converted twice with identical levels).
# NOTE(review): a filter_criteria value missing from this list becomes NA on
# the x axis - extend the list when new criteria are introduced.
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


# One facet per evidence count; bars per filter criterion, stacked by match class.
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    # border around each facet; `linewidth` replaces the deprecated `size`
    # argument for line elements (ggplot2 >= 3.4.0)
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
  )

# Save the plot above as a 6 x 6 inch PDF in the species' report directory.
ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


# Export the rejected pairs as an Excel table in the species' report directory.
openxlsx::write.xlsx(rejected, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


# Build an undirected graph from the distinct kept (from, to) gene pairs and
# label every gene with the id of the connected component it belongs to.
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# Attach the component id via the source gene only: in an undirected graph both
# ends of an edge share one component, so a second lookup via to_geneID would
# give the same value. The commented lines below are the two-sided variant that
# would be needed for a directed graph.
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


# Export the kept pairs (now including weak_component) to the output directory.
openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


# Orthology sources available for this species, selected by flag1:
#   1 -> all six sources, 2 -> no PLAZA, 3 -> neither PLAZA nor ensembl-compara,
#   anything else -> MCScanX, RBH and FastOMA only.
# Conditional entries evaluate to NULL when not selected and are dropped by c().
source_cols = c(
  "MCScanX",
  if (flag1 == 1 || flag1 == 2) "ensembl-compara",  # make flags
  if (flag1 == 1) "PLAZA",
  if (flag1 == 1 || flag1 == 2 || flag1 == 3) 'OrthoDB',
  'RBH',
  "FastOMA"
)





# https://krassowski.github.io/complex-upset/articles/Examples_R.html
# UpSet plot of the kept gene pairs: which combinations of sources support them.
# `kept` is a data.table; it is converted to a plain data.frame first, exactly
# as the pre-filter UpSet plot in this report does, so both calls feed upset()
# the same kind of input.
upset_plot = upset(
  as.data.frame(kept),
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



# Headline numbers for the kept pairs: distinct source / target genes, range of
# the per-gene pair counts, and how many genes take part in more than 30 pairs.
cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
data.table::uniqueN(kept$from_geneID)
## [1] 20899
data.table::uniqueN(kept$to_geneID)
## [1] 68524
range(kept$from_count)
## [1]   1 258
range(kept$to_count)
## [1]  1 96
data.table::uniqueN(kept$from_geneID[kept$from_count > 30])
## [1] 255
data.table::uniqueN(kept$to_geneID[kept$to_count > 30])
## [1] 78
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

10.13 PSS kept/rejected

# Keep only the identifier, pathway and short-name columns of the PSS table.
# NOTE(review): selecting columns with `x[, <integer vector>]` relies on
# pss_long behaving like a data.frame here - confirm its class.
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]

# Attach orthologue, annotation and filter information for every PSS id.
# The column subset is now taken from names(df), the table actually being
# subset (it used to be looked up in names(dt), a different variable).
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(df), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)

# Restrict to ids starting with "AT", drop exact duplicate rows, and tabulate
# the filter criterion of the matched pairs.
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##             891             583            2541             305            3147
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
# Parameters handed to the child document for this species.
params_list <- list(
  # Species labels used to pick this plant's rows from each orthology source.
  plantName1 = 'psib',               # change name - PLAZA, OrthoDB, RBH
  plantName2 = 'psib',               # change name - compara # sources
  plantName3 = '^siberianapricot$',  # change name - MCScanX # sources (used as a regular expression)
  plantName4 = 'psib',               # change name - FastOMA # sources

  # Output naming.
  plantNameOut = "siberianapricot",
  plantDirOut = file.path('..', 'reports', 'fruitTrees', "siberianapricot"),

  # Protein/transcript ID -> gene ID: the pattern matches the last two
  # dot-separated fields of an ID and they are replaced by pattern_out.
  pattern_in = "(\\.[^.]+){2}$",     # all-IDs
  pattern_out = "",                  # all-IDs

  # Mercator annotation file of the plant and the clean-up applied to its IDs.
  mercator = 'psib_Mercator4v7_results.txt',          # plant-gmm
  mercatorPatternIn1 = "[\u2018\u2019\u201C\u201D']", # plant-gmm, generic removal of quote characters
  mercatorPatternOut1 = "",                           # plant-gmm
  mercatorPatternIn2 = "f106g",                       # plant-gmm
  mercatorPatternOut2 = "F106G",

  # Switches read by the child document.
  flag1 = 4,
  flag2 = 4,
  flag3 = FALSE
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000027465d3e430>

child_content <- knitr::knit_child("08_fruitTrees-child1.rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./08_fruitTrees-child1.rmd

| | | 0% | |.. | 3% | |… | 6% [unnamed-chunk-248] | |….. | 9% | |…… | 12% [unnamed-chunk-249] | |…….. | 15% | |……… | 18% [unnamed-chunk-250] | |……….. | 21% | |………… | 24% [unnamed-chunk-251] | |………….. | 27% | |…………… | 30% [unnamed-chunk-252] | |…………….. | 33% | |……………… | 36% [unnamed-chunk-253] | |……………….. | 39% | |………………… | 42% [unnamed-chunk-254] | |………………….. | 45% | |…………………… | 48% [unnamed-chunk-255] | |…………………….. | 52% | |……………………… | 55% [unnamed-chunk-256] | |……………………….. | 58% | |………………………… | 61% [unnamed-chunk-257] | |………………………….. | 64% | |…………………………… | 67% [unnamed-chunk-258] | |…………………………….. | 70% | |……………………………… | 73% [unnamed-chunk-259] | |……………………………….. | 76% | |………………………………… | 79% [unnamed-chunk-260] | |………………………………….. | 82% | |…………………………………… | 85% [unnamed-chunk-261] | |…………………………………….. | 88% | |……………………………………… | 91% [unnamed-chunk-262] | |……………………………………….. | 94% | |………………………………………… | 97% [unnamed-chunk-263] | |…………………………………………..| 100%

cat(child_content)

11 Subsection: psib

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

11.1 Ortho sources

# Collect the zipped orthology tables from ../intermediate and bring each one
# into the common five-column layout
#   from_geneID, from_protID, to_geneID, to_protID, source
# keeping only the rows that belong to the current plant.
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep('PLAZA_selection|FastOMA2_ath-pairs|JCVI_MCScanX_plants|comparaPlants_hc-to-ath|OrthoDB_fruitTrees|RBH_fruitTrees', fl)] # change names
fl = fl[grep('\\.zip$', fl)]

pair_cols = c('from_geneID', 'from_protID', 'to_geneID', 'to_protID', 'source')

# One slot per input file; the slots are bound together once after the loop
# instead of growing `df` with rbind() on every iteration.
pair_tables = vector('list', length(fl))

for (k in seq_along(fl)) {
  
  i = fl[k]
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  # The branching below needs exactly one source label per file. An empty
  # table is skipped; a file with several (or NA) labels is a hard error with
  # an explicit message instead of a cryptic failure inside `if`.
  if (length(us) == 0L) {
    warning('Skipping ', i, ': no `source` values found', call. = FALSE)
    next
  }
  if (length(us) != 1L || is.na(us)) {
    stop('Expected exactly one non-NA `source` value in ', i, ', found: ',
         paste(us, collapse = ', '), call. = FALSE)
  }
  
  if (us == 'ensembl-compara') {
    
    dt = dt[dt$homology_species == plantName2, ]
    # print(head(dt))
    dt = dt[, c(1,2,6,7,10)]
    colnames(dt) = pair_cols
    
  } else if (us == 'FastOMA') {
    
    dt = dt[dt$to_plant == plantName4, ]
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 5)]
    colnames(dt) = pair_cols
    # blank both gene-ID columns; they are derived from the protein IDs later
    dt[, 1] = NA
    dt[, 3] = NA
    
  } else if (us == 'MCScanX') {
    
    # dt = dt[grepl('stu', dt$to_plant), ]
    dt = dt[grepl(plantName3, dt$to_plant), ] #  change names
    # print(head(dt))
    dt = dt[, c(2,1, 4,3, 6)]
    colnames(dt) = pair_cols
    # blank both gene-ID columns; they are derived from the protein IDs later
    dt[, 1] = NA
    dt[, 3] = NA
    
  } else if (us == 'PLAZA') {
    
    dt = dt[dt$orthologous_species == plantName1, ]
    # print(head(dt))
    colnames(dt) = pair_cols
    # blank both protein-ID columns
    dt[, 2] = NA
    dt[, 4] = NA
    
  } else if (us == 'OrthoDB' || us == 'RBH') {
    
    dt = dt[dt$to_plant == plantName1, ]
    # print(head(dt))
    colnames(dt) = pair_cols
    # blank both protein-ID columns
    dt[, 2] = NA
    dt[, 4] = NA
    
  } else {
    
    # Unknown label: report it as a warning (with file and label) and leave
    # the file out, as before.
    warning('ERROR: Unknown source "', us, '" in ', i, call. = FALSE)
    next
    
  }
  
  pair_tables[[k]] = dt
}

# Stack all per-source tables; skipped files left NULL slots, which are ignored.
df = data.table::rbindlist(pair_tables, use.names = TRUE)
## [1] "../intermediate/comparaPlants_hc-to-ath.txt.zip"
## [1] "../intermediate/FastOMA2_ath-pairs.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_plants.txt.zip"
## [1] "../intermediate/OrthoDB_fruitTrees.txt.zip"
## [1] "../intermediate/PLAZA_selection.txt.zip"
## [1] "../intermediate/RBH_fruitTrees.txt.zip"
table(df$source)
## 
## FastOMA MCScanX     RBH 
##   40732   29159   25288
# Preview: the first two and the last two rows of every source, grouped together
# per source (the object name says "three", but n = 2 rows are taken per end).
first_last_three_per_source = dplyr::bind_rows(
  dplyr::slice_head(dplyr::group_by(df, source), n = 2),
  dplyr::slice_tail(dplyr::group_by(df, source), n = 2)
) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup()

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_protID to_geneID         to_protID                source 
##    <chr>       <chr>       <chr>             <chr>                    <chr>  
##  1 <NA>        AT1G12040.1 <NA>              PaF106G0100000005.01.P01 FastOMA
##  2 <NA>        AT1G62440.1 <NA>              PaF106G0100000005.01.P01 FastOMA
##  3 <NA>        AT3G07140.1 <NA>              PaF106G0800032954.01.P01 FastOMA
##  4 <NA>        AT3G07140.1 <NA>              PaF106G0800032956.01.P02 FastOMA
##  5 <NA>        AT5G58130.1 <NA>              PaF106G0100000008.01.T01 MCScanX
##  6 <NA>        AT5G58110.1 <NA>              PaF106G0100000009.01.T01 MCScanX
##  7 <NA>        AT1G33360.1 <NA>              PaF106G0800032937.01.T03 MCScanX
##  8 <NA>        AT4G10260.1 <NA>              PaF106G0800032938.01.T01 MCScanX
##  9 AT1G01030   <NA>        PaF106G0500020091 <NA>                     RBH    
## 10 AT1G01040   <NA>        PaF106G0200009357 <NA>                     RBH    
## 11 ATMG01190   <NA>        PaF106G0600023358 <NA>                     RBH    
## 12 ATMG01250   <NA>        PaF106G0700028671 <NA>                     RBH

11.2 Transcript (aka protein) to geneID

# Fill missing source gene IDs from the protein/transcript ID by stripping a
# trailing ".<digits>" suffix (e.g. AT1G12040.1 -> AT1G12040).
ind = which(is.na(df$from_geneID))
df$from_geneID[ind] = sub("\\.[0-9]+$", "", df$from_protID[ind])

# Source gene IDs that still contain a dot were not reduced by the rule above;
# in the captured output below these are uORF entries. Show them per source.
ind = grep('\\.', df$from_geneID)
table(df[ind, ]$source)
## 
## MCScanX 
##       4
print(df[ind, ])
##        from_geneID     from_protID to_geneID                to_protID  source
##             <char>          <char>    <char>                   <char>  <char>
## 1: AT1G36730.uORF1 AT1G36730.uORF1      <NA> PaF106G0100005844.01.T01 MCScanX
## 2: AT4G25670.uORF1 AT4G25670.uORF1      <NA> PaF106G0200010554.01.T02 MCScanX
## 3: AT4G25690.uORF1 AT4G25690.uORF1      <NA> PaF106G0200010554.01.T02 MCScanX
## 4: AT5G52550.uORF1 AT5G52550.uORF1      <NA> PaF106G0200010554.01.T02 MCScanX
# Fill missing target gene IDs from the target protein ID using the
# species-specific pattern_in / pattern_out substitution.
ind = which(is.na(df$to_geneID))
df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_protID[ind]) # change logic as needed



# Same preview as before, now that the gene-ID columns have been filled in:
# first two and last two rows of every source.
first_last_three_per_source = dplyr::bind_rows(
  dplyr::slice_head(dplyr::group_by(df, source), n = 2),
  dplyr::slice_tail(dplyr::group_by(df, source), n = 2)
) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup()

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_protID to_geneID         to_protID                source 
##    <chr>       <chr>       <chr>             <chr>                    <chr>  
##  1 AT1G12040   AT1G12040.1 PaF106G0100000005 PaF106G0100000005.01.P01 FastOMA
##  2 AT1G62440   AT1G62440.1 PaF106G0100000005 PaF106G0100000005.01.P01 FastOMA
##  3 AT3G07140   AT3G07140.1 PaF106G0800032954 PaF106G0800032954.01.P01 FastOMA
##  4 AT3G07140   AT3G07140.1 PaF106G0800032956 PaF106G0800032956.01.P02 FastOMA
##  5 AT5G58130   AT5G58130.1 PaF106G0100000008 PaF106G0100000008.01.T01 MCScanX
##  6 AT5G58110   AT5G58110.1 PaF106G0100000009 PaF106G0100000009.01.T01 MCScanX
##  7 AT1G33360   AT1G33360.1 PaF106G0800032937 PaF106G0800032937.01.T03 MCScanX
##  8 AT4G10260   AT4G10260.1 PaF106G0800032938 PaF106G0800032938.01.T01 MCScanX
##  9 AT1G01030   <NA>        PaF106G0500020091 <NA>                     RBH    
## 10 AT1G01040   <NA>        PaF106G0200009357 <NA>                     RBH    
## 11 ATMG01190   <NA>        PaF106G0600023358 <NA>                     RBH    
## 12 ATMG01250   <NA>        PaF106G0700028671 <NA>                     RBH
# Sanity check per source: how many rows still lack a target gene ID, and how
# many lack a target protein ID.
summary_na = df[, .(
  na_to_geneID = sum(is.na(to_geneID)),
  na_to_protID = sum(is.na(to_protID))
), by = source]
print(summary_na)
##     source na_to_geneID na_to_protID
##     <char>        <int>        <int>
## 1: FastOMA            0            0
## 2: MCScanX            0            0
## 3:     RBH            0        25288

11.3 PLAZA and ensembl-compara with Orthofinder

Some pairs are lost here because genes do not translate well between versions!

# Translate the target gene IDs of the PLAZA and ensembl-compara pairs via the
# mapping tables found in the OrthoFinder input directory. The translation is
# skipped (only the three core columns are kept) when either flag equals 4.
# The condition is a scalar test, so the short-circuit `&&` is used
# (it was the elementwise `&`).
# NOTE(review): plantDirIn, ref_genome, compara_pattern_in1/out1,
# compara_pattern_in2/out2 and plaza_pattern_in1/in2 are not defined in the
# visible part of this document - they must be supplied by the caller whenever
# this branch is taken.
if (flag1 != 4 && flag2 != 4) {

  fp = file.path('..', 'input', 'OrthoFinder', plantDirIn)
  
  # Load the mapping tables; fall back to an empty 4-column frame when a
  # file is absent.
  fl = list.files(fp)
  fn = fl[grep('Compara_', fl)] # change filename
  if (length(fn) != 0) {
    compara = data.table::fread(file.path(fp, fn))
  } else {
    compara = data.frame(matrix(ncol = 4, nrow = 0))
  }
  
  fn = fl[grep('PLAZA_', fl)] # change filename
  if (length(fn) != 0) {
    plaza = data.table::fread(file.path(fp, fn))
  } else {
    plaza = data.frame(matrix(ncol = 4, nrow = 0))
  }
  
  
  # Keep only the rows of the reference genome.
  compara = compara[compara$Species == ref_genome, ] # change name
  plaza = plaza[plaza$Species == ref_genome, ] # change name
  
  
  colnames(compara)[3] = colnames(plaza)[3] = 'source'
  
  # Expand each compara row (comma-separated ID lists in `source` and
  # `Orthologs`) into all pairwise (OrthoDB_ID, Ortholog) combinations.
  if (nrow(compara) != 0) {
    compara[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    compara[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    result = compara[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(pairs, c("OrthoDB_ID", "Ortholog"))
      pairs
    }, by = seq_len(nrow(compara))]
    compara = result[, seq_len := NULL]
    # compara$Ortholog = sapply(compara$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    compara$OrthoDB_ID = sub(compara_pattern_in1, compara_pattern_out1, 
                             sub(compara_pattern_in2, compara_pattern_out2, compara$OrthoDB_ID)) # change when needed
    compara = compara[!duplicated(compara), ]
    head(compara)
  }
  
  
  # Same expansion for the PLAZA mapping table.
  if (nrow(plaza) != 0) {
    plaza[, OrthoDB_list := stringr::str_split(source, pattern = ",\\s*")] # change colname
    plaza[, Orthologs_list := stringr::str_split(Orthologs, pattern = ",\\s*")]
    result = plaza[, {
      # Cartesian join of OrthoDB_list and Orthologs_list for this row
      pairs = CJ(OrthoDB_list[[1]], Orthologs_list[[1]], sorted = FALSE)
      setnames(pairs, c("OrthoDB_ID", "Ortholog"))
      pairs
    }, by = seq_len(nrow(plaza))]
    plaza = result[, seq_len := NULL]
    # plaza$Ortholog = sapply(plaza$Ortholog, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change when needed
    plaza$OrthoDB_ID = sub(plaza_pattern_in1, '', sub(plaza_pattern_in2, "", plaza$OrthoDB_ID)) # change when needed
    plaza = plaza[!duplicated(plaza), ]
    head(plaza)  
  }
  
  # Optionally keep only the text after the last blank of each compara ortholog.
  if (flag3) compara$Ortholog = gsub('.* ', '', compara$Ortholog) # improve if possible
  
  # Replace the target gene ID of the ensembl-compara pairs by the mapped
  # ortholog; flag2 selects which column of df is used as the join key.
  if (nrow(compara) != 0) {
    if (flag2 == 1) { # geneID and prot ID are completely different # make flags
      df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog)  
    } else if (flag2 == 2) {
        df_compara = dplyr::filter(df, source == "ensembl-compara") %>%
        dplyr::left_join(compara, by = c("to_protID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
        dplyr::mutate(to_geneID = Ortholog) %>%
        dplyr::select(-Ortholog) 
    } else {
      df_compara = NULL
    }
    # pairs without a mapped ortholog are dropped
    df_compara = df_compara[!is.na(df_compara$to_geneID), ]
  }
  
  
  
  # Same replacement for the PLAZA pairs (always joined on to_geneID).
  if (nrow(plaza) != 0) {
    df_plaza = dplyr::filter(df, source == "PLAZA") %>%
      dplyr::left_join(plaza, by = c("to_geneID" = "OrthoDB_ID"), relationship = "many-to-many") %>%
      dplyr::mutate(to_geneID = Ortholog) %>%
      dplyr::select(-Ortholog)
    df_plaza = df_plaza[!is.na(df_plaza$to_geneID), ]
  }
  
  # Re-assemble: remapped pairs plus all pairs of the remaining sources.
  # NOTE(review): when the compara table is empty but the PLAZA table is not,
  # df_plaza is computed above but not used here (dt = df) - confirm intended.
  if (nrow(compara) != 0) {
    if (nrow(plaza) != 0) {
      df_other = dplyr::filter(df, !(source %in% c("ensembl-compara", "PLAZA")))  
      dt = dplyr::bind_rows(df_compara, df_plaza, df_other)
    } else {
      df_other = dplyr::filter(df, !(source %in% c("ensembl-compara")))
      dt = dplyr::bind_rows(df_compara, df_other)
    }
  } else {
    dt = df
  }
  
  
  # Keep the three core columns and drop duplicate pairs.
  ind = c(grep("from_geneID|to_geneID|source", colnames(dt)))
  df = dt[, ..ind]
  df = df[!duplicated(df), ]
  
  
  # Reduce the remapped target IDs with the species-specific pattern.
  if (nrow(compara) != 0) {
    if (nrow(plaza) != 0) {
      ind = which(df$source %in% c('ensembl-compara', 'PLAZA'))
      df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
    } else {
      ind = which(df$source %in% c('ensembl-compara'))
      df$to_geneID[ind] = sub(pattern_in, pattern_out, df$to_geneID[ind]) # change logic as needed
    }
  }
  
  
  
  
  
  # Preview: first two and last two rows per source.
  df %>%
    dplyr::group_by(source) %>%
    dplyr::slice_head(n = 2) %>%
    dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
    dplyr::arrange(source) %>%
    dplyr::ungroup() -> first_last_three_per_source
  
  print(first_last_three_per_source, n = nrow(first_last_three_per_source))

} else {
  # No remapping: just reduce df to the three core columns.
  ind = c(grep("from_geneID|to_geneID|source", colnames(df)))
  df = df[, ..ind]
  
}

# Collapse identical (from_geneID, to_geneID, source) rows.
df = unique(df)

# Clear the workspace except for the pair table, the shared annotation tables
# and the parameters still needed further down. `keep_objects` itself is not
# in the list, so it is removed by the same call.
keep_objects = c(
  "df",
  "ath.gmm", "gn", "sn", "pss_long",
  "plantName1",
  "plantNameOut",
  "plantDirOut",
  "pattern_in",
  "pattern_out",
  "mercator",
  "mercatorPatternIn1",
  "mercatorPatternOut1",
  "mercatorPatternIn2",
  "mercatorPatternOut2",
  "flag1", "flag2"
)
rm(list = setdiff(ls(), keep_objects))




gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  2481590 132.6    5564632 297.2  11731995  626.6
## Vcells 49730335 379.5   89555863 683.3 193111802 1473.4
library(magrittr)
# library(data.table)
library(ggplot2)
library(ComplexUpset)

11.4 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 21431
length(unique(dt$to_geneID))
## [1] 19982
table(dt$source)
## 
## FastOMA MCScanX     RBH 
##   40732   16401   25288
# Marker column: every (from, to, source) row that exists counts as evidence.
dt[, present := TRUE]

# One row per gene pair, one logical column per source (FALSE where the source
# does not report the pair).
dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

# Order the pairs by source gene, then target gene.
dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

11.5 Upset plot

# Orthology sources available for this species, selected by flag1:
#   1 -> all six sources, 2 -> no PLAZA, 3 -> neither PLAZA nor ensembl-compara,
#   anything else -> MCScanX, RBH and FastOMA only.
# Conditional entries evaluate to NULL when not selected and are dropped by c().
source_cols = c(
  "MCScanX",
  if (flag1 == 1 || flag1 == 2) "ensembl-compara",
  if (flag1 == 1) "PLAZA",
  if (flag1 == 1 || flag1 == 2 || flag1 == 3) 'OrthoDB',
  'RBH',
  "FastOMA"
)


# Number of sources supporting each pair: row sum over the logical source columns.
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

# Distribution of the evidence count for this species.
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName1, ' evidence'))

# upset() is fed a plain data.frame copy of the wide table.
dff = as.data.frame(dt.wide)

# UpSet plot of all pairs (before any filtering): which source combinations
# support them.
upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods")

# Print or save the plot
print(upset_plot)

# Wide (24 x 6 inch) PDF in the species' report directory.
ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

11.6 Ath ORFs

  • Take care: the ath CDS FASTA (used for MCScanX) contains, besides e.g. AT1G30330.1, AT1G30330.2 and AT1G30330.3, also uORF entries such as:
>AT1G30330.uORF1 pacid=37393466 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGATTTATTTCAGGGAAGAAGAAATAAATCTGTTTTTTTTAGGGTTTTTAGATTTGGTT
GGTGAATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAG
>AT1G30330.uORF2 pacid=37393467 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGGGTGGGAGGTGGAGGGAAACAGTTAAAAAAGTTATGCTTTTAGTGTCTCTTCTTCAT
AATTACATTTGGGCATCTTGA
>AT1G30330.uORF3 pacid=37393468 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGAAGGAGTTGAAGATTCGAAGAAGCGGTTTTGAAGTCGGCGAGACCAAGATTGCGAGC
TTATTTGGCTGA
>AT1G30330.uORF5 pacid=37393469 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCTTTTAGTGTCTCTTCTTCATAATTACATTTGGGCATCTTGA
>AT1G30330.uORF4 pacid=37393470 polypeptide= locus=AT1G30330 ID=.Araport11.447 annot-version=Araport11
ATGCCCCATATCTCTCTGTTTCTCATTTCCCGATCTTTGCATTAA
# Show the pairs whose source ID contains "ORF", then remove them from the
# wide table.
dt.wide[grepl('ORF', dt.wide$from_geneID), ]
## Key: <from_geneID, to_geneID>
##        from_geneID         to_geneID FastOMA MCScanX    RBH count_evidence
##             <char>            <char>  <lgcl>  <lgcl> <lgcl>          <num>
## 1: AT1G36730.uORF1 PaF106G0100005844   FALSE    TRUE  FALSE              1
## 2: AT4G25670.uORF1 PaF106G0200010554   FALSE    TRUE  FALSE              1
## 3: AT4G25690.uORF1 PaF106G0200010554   FALSE    TRUE  FALSE              1
## 4: AT5G52550.uORF1 PaF106G0200010554   FALSE    TRUE  FALSE              1
dt.wide = dt.wide[!grepl('ORF', dt.wide$from_geneID), ]

11.7 Gene occurrence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

11.8 In/out PSS

# Annotate every pair on its source gene: MapMan annotation (ath.gmm), gene
# name (gn), synonyms (sn) and PSS membership (pss_long). All four are left
# joins, so every pair is retained.
df = dt.wide %>%
  merge(ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE) %>%
  merge(gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) %>%
  merge(sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) %>%
  merge(pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

# PSS entries with an "AT..." id that have no orthologue pair at all, carrying
# the same three annotations.
nin = pss_long[!(pss_long$id %in% df$from_geneID), ]
nin = nin[grepl('^AT', nin$id), ]
nin = nin %>%
  merge(ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE) %>%
  merge(gn, by.x = 'id', by.y = 'V1', all.x = TRUE) %>%
  merge(sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(
  nin,
  paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
  asTable = TRUE
) # change name

11.9 fruitTrees plant gmm

# Read the plant's Mercator annotation and drop rows whose identifier is the
# empty quoted string.
fp = file.path('..', 'input', 'Mercator')
fn = mercator
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
gmm = gmm[gmm$IDENTIFIER != "''", ]

# One row per identifier: distinct bin codes, names and descriptions are
# collapsed into " | "-separated strings.
combined = gmm[, .(
  BINCODE = paste(unique(BINCODE), collapse = " | "),
  NAME = paste(unique(NAME), collapse = " | "),
  DESCRIPTION = paste(unique(DESCRIPTION), collapse = " | ")
), by = IDENTIFIER]

# Inspect the raw bytes of the first identifier (0x27 is the quote character
# at both ends).
charToRaw(combined$IDENTIFIER[1])
##  [1] 27 70 61 66 31 30 36 67 30 36 30 30 30 32 33 31 30 39 2e 30 31 2e 70 30 31
## [26] 27
# combined$IDENTIFIER = sapply(combined$IDENTIFIER, function(x) paste(unlist(strsplit(x, "_"))[1:2], collapse = "_")) # change as needed
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# combined$IDENTIFIER = sub("[\u2018\u2019\u201C\u201D']", "", combined$IDENTIFIER, perl = TRUE)  # change as needed
# charToRaw(combined$IDENTIFIER[1])
# When the ' character appears more than once in a string (such as at both start and end), sub() will remove only one occurrence per call.
# Hence gsub(): remove every character matched by mercatorPatternIn1.
combined$IDENTIFIER = gsub(mercatorPatternIn1, mercatorPatternOut1, combined$IDENTIFIER, perl = TRUE)  # change as needed
charToRaw(combined$IDENTIFIER[1])
##  [1] 70 61 66 31 30 36 67 30 36 30 30 30 32 33 31 30 39 2e 30 31 2e 70 30 31
# Upper-case the first character, apply the second species-specific
# replacement, then reduce the protein ID to a gene ID with pattern_in / pattern_out.
combined$IDENTIFIER = paste0(toupper(substring(combined$IDENTIFIER, 1, 1)), substring(combined$IDENTIFIER, 2))  # change as needed
combined$IDENTIFIER = gsub(mercatorPatternIn2, mercatorPatternOut2, combined$IDENTIFIER, perl=TRUE) # change as needed;
combined$IDENTIFIER = sub(pattern_in, pattern_out, combined$IDENTIFIER, perl=TRUE)
# How many annotated identifiers occur as a target gene in the pair table.
table(combined$IDENTIFIER %in% dt$to_geneID)
## 
## FALSE  TRUE 
## 12977 19982
# NOTE(review): sub() removes only the FIRST quote in each annotation string
# (see the remark on sub() above); the remaining quotes stay in place. Kept
# as is - confirm whether later comparisons expect this form before changing it.
combined$BINCODE = sub("\\'", '', combined$BINCODE )
combined$NAME = sub("\\'", '', combined$NAME)
combined$DESCRIPTION = sub("\\'", '', combined$DESCRIPTION)

# Prefix the three annotation columns with 'fruitTrees_'.
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

names(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "RBH"             "from_count"      "to_count"        "count_evidence" 
##  [9] "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION" "athName"        
## [13] "athSynonims"     "all_pathways"    "short_name"
# Left-join the target-plant MapMan annotation onto the pairs via the target gene ID.
dt <- merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE 
## 53637
dt$to_geneID[is.na(dt$fruitTrees_BINCODE)] # check ones with strange ID
## character(0)
# Restore the original column order of df and append the newly joined columns.
dt_cols <- names(df)
new_cols <- setdiff(names(dt), dt_cols)
df <- as.data.frame(dt)[, c(dt_cols, new_cols)]

# Clear the workspace except for the result table, the shared annotation
# tables and the run parameters.
keep_objects <- c("df",
                  "ath.gmm", "gn", "sn", "pss_long",
                  "plantName1",
                  "plantNameOut",
                  "plantDirOut",
                  "pattern_in",
                  "pattern_out",
                  "mercator",
                  "mercatorPatternIn1",
                  "mercatorPatternOut1",
                  "mercatorPatternIn2",
                  "mercatorPatternOut2",
                  "flag1", "flag2")
rm(list = setdiff(ls(), keep_objects))


gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  1972911 105.4    5564632 297.2  11731995  626.6
## Vcells 20962544 160.0   71644691 546.7 193111802 1473.4
library(magrittr)
library(ggplot2)
library(ComplexUpset)

11.10 Translation table

MapMan Mercator matches: first three levels only

# Drop fully duplicated rows.
df <- unique(df)


#' Compare two MapMan bin annotations on their first three hierarchy levels
#'
#' Each annotation is a single string holding one or more bin codes separated
#' by "|" and/or ";". Every code is cut to at most three dot-separated levels
#' and the two resulting sets of truncated codes are compared.
#'
#' @param athMercator Bin-code string of the first gene (scalar character, may
#'   be NA or "").
#' @param plantXMercator Bin-code string of the second gene (scalar character,
#'   may be NA or "").
#' @return A single string: "100% match based on <bins>" when both sets are
#'   identical, "partial match based on <bins>" when they share at least one
#'   bin, otherwise "no match". A missing or empty annotation on either side
#'   always gives "no match".
compare_bin <- function(athMercator, plantXMercator) {
  # Turn one annotation string into the set of unique bin codes, each
  # truncated to its first three levels.
  split_tokens <- function(code) {
    if (length(code) == 0 || is.na(code) || code == "") return(character(0))
    tokens <- strsplit(as.character(code), "[|;]")[[1]]
    # The upstream clean-up uses sub(), which removes only the first single
    # quote of a value, so stray quotes can survive; strip them so that
    # "1.1.1'" and a truncated "1.1.1" compare equal.
    tokens <- trimws(gsub("'", "", tokens, fixed = TRUE))
    tokens <- tokens[nzchar(tokens)]
    truncated <- vapply(
      strsplit(tokens, ".", fixed = TRUE),
      function(levels) paste(utils::head(levels, 3), collapse = "."),
      character(1)
    )
    unique(truncated)
  }

  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)

  # Two empty sets are "equal", but a missing annotation is not evidence of a
  # match, so bail out before the set comparison.
  if (length(bin_set) == 0 || length(v4_set) == 0) {
    return("no match")
  }

  # Identical sets. This also covers the case where several codes on one side
  # collapse onto the single truncated code of the other side.
  if (setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }

  # Partial match if any truncated codes overlap; report the shared ones.
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }

  "no match"
}



# Classify every pair by comparing its Arabidopsis bins with its target-plant
# bins. rowwise() makes compare_bin() see one pair of values per call.
df <- dplyr::ungroup(
  dplyr::mutate(
    dplyr::rowwise(df),
    MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)  # change name
  )
)

11.11 Filter

# now

# Size of the pair table before filtering: distinct genes on each side, range
# of the per-gene pair counts, and number of genes occurring in more than 30 pairs.
cat("####  ####  before filter ####  ####  \n")
## ####  ####  before filter ####  ####
data.table::uniqueN(df$from_geneID)
## [1] 21427
data.table::uniqueN(df$to_geneID)
## [1] 19981
range(df$from_count)
## [1]  1 58
range(df$to_count)
## [1]   1 115
data.table::uniqueN(df$from_geneID[df$from_count > 30])
## [1] 132
data.table::uniqueN(df$to_geneID[df$to_count > 30])
## [1] 105
cat("####  ####  ####  ####  \n")
## ####  ####  ####  ####
# Work on a data.table copy; every pair starts out as rejected.
dt <- as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes <- character()


# Methods in the order in which they are processed, selected by flag1
# (1, 2, 3; any other value takes the shortest list).  # make flags
methods_by_flag <- list(
  c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA"),
  c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA"),
  c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
)
flag_hit <- which(c(1, 2, 3) == flag1)
methods <- if (length(flag_hit) > 0) {
  methods_by_flag[[flag_hit[1]]]
} else {
  c("MCScanX", 'RBH', "FastOMA")
}


match_categories <- c("no match", "100% match based", "partial match")

# For every method: number of pairs it supports, split by MapMan match category.
long_dt <- data.table::rbindlist(lapply(methods, function(method) {
  supported <- dt[[method]] == TRUE
  data.table::data.table(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(supported & dt$MapMan4_Match == "no match"),
      sum(supported & stringr::str_detect(dt$MapMan4_Match, "100% match based")),
      sum(supported & stringr::str_detect(dt$MapMan4_Match, "partial match"))
    )
  )
}), use.names = TRUE)

# Stacking order of the bars.
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

before_plot1 <- ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_col() +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
print(before_plot1)

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
                plot = before_plot1,
                device = "pdf", width = 6, height = 6, units = "in")


# Evidence count versus match class, with the "based on ..." detail cut off.
# The two matching classes keep a trailing space after the cut.
dtsub <- dt[, grep("count_evidence|MapMan4_Match", names(dt), value = TRUE), with = FALSE]
dtsub[, MapMan4_Match := sub('based on.*', '', MapMan4_Match)]
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          44386           7389           1862
evidence_by_match <- table(dtsub$count_evidence, dtsub$MapMan4_Match)
evidence_by_match
##    
##     100% match  no match partial match 
##   1       25268     6325           1687
##   2       11048      781            105
##   3        8070      283             70
tab <- as.data.table(as.data.frame(evidence_by_match))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

# The level names carry the trailing space left by the sub() above.
tab[, MapMan4_Match := factor(as.character(MapMan4_Match),
                              levels = c('no match', 'partial match ', '100% match '))]

before_plot2 <- ggplot2::ggplot(tab, ggplot2::aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  ggplot2::geom_col() +
  ggplot2::labs(title = "Frequency of count_evidence by MapMan4_Match",
                x = "count_evidence",
                y = "Frequency",
                fill = "MapMan4_Match") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
print(before_plot2)

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
                plot = before_plot2,
                device = "pdf", width = 6, height = 6, units = "in")




# Methods that need extra support before a pair is accepted; OrthoDB is left
# out as soon as either flag equals 4.
special_methods <- if (flag1 == 4 || flag2 == 4) {
  c("RBH", "FastOMA")
} else {
  c("OrthoDB", "RBH", "FastOMA")
}

# Number of pairs accepted only because of a MapMan4 match, one counter per
# special method (named "<method>_MapMan4").
mapman4_counts <- setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

# Alphabetical order used to name a pair supported by two special methods.
sorted_special <- sort(special_methods)

# Walk through the methods in priority order. A pair is a candidate for a
# method when it is still rejected, the method supports it, and neither of its
# gene IDs is in covered_genes. Coverage is evaluated once per method, so all
# pairs accepted by the same method are kept even if they share a target gene.
for (method in methods) {

  candidates <- which(
    dt$filter_criteria == "reject" & dt[[method]] == TRUE &
      !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  )
  if (length(candidates) == 0) next

  if (method %in% special_methods) {
    # For every candidate, which special methods support it (NA counts as "no").
    support <- lapply(sorted_special, function(m) dt[[m]][candidates] %in% TRUE)
    n_support <- Reduce(`+`, support)

    # "<A>_<B>" label built from the supporting methods in alphabetical order.
    combo <- rep("", length(candidates))
    for (k in seq_along(sorted_special)) {
      hit <- support[[k]]
      combo[hit] <- ifelse(nzchar(combo[hit]),
                           paste(combo[hit], sorted_special[k], sep = "_"),
                           sorted_special[k])
    }

    new_criteria <- rep(NA_character_, length(candidates))
    new_criteria[n_support == 3] <- "OrthoDB_FastOMA_RBH"
    new_criteria[n_support == 2] <- combo[n_support == 2]

    # Supported by this method alone: accept only with a full or partial MapMan4 match.
    # reconsider
    # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
    #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
    by_mapman <- n_support == 1 &
      grepl("match based on", dt$MapMan4_Match[candidates], ignore.case = TRUE)
    mapman_label <- paste0(method, "_MapMan4")
    new_criteria[by_mapman] <- mapman_label
    mapman4_counts[[mapman_label]] <- mapman4_counts[[mapman_label]] + sum(by_mapman)

    is_accepted <- !is.na(new_criteria)
    accepted <- candidates[is_accepted]
    accepted_criteria <- new_criteria[is_accepted]
  } else {
    # Non-special methods accept every candidate under the method's own name.
    accepted <- candidates
    accepted_criteria <- rep(method, length(candidates))
  }

  if (length(accepted) > 0) {
    dt[accepted, filter_criteria := accepted_criteria]
    # Only the target gene is marked as covered; the variant that also covered
    # the source gene is disabled:
    # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
    covered_genes <- unique(c(covered_genes, dt$to_geneID[accepted]))
  }
}

# Checkpoint after the loop: pairs accepted via a MapMan4 match per method,
# followed by the overall distribution of filter outcomes.
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
##     RBH_MapMan4 FastOMA_MapMan4 
##            4287            4555
cat("####  ####  ####  ####  \n")
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##            4555            5843           16397            4287           22555
cat("####  ####  ####  ####  \n")
## ####  ####  ####  ####
# Plain assignment: df and dt are the same data.table from here on.
df <- dt

# Export all pairs, kept and rejected, as tab-separated text and as an Excel table.
all_pairs_stem <- paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15')
data.table::fwrite(df, paste0(all_pairs_stem, '.txt'), sep = '\t')
openxlsx::write.xlsx(df, paste0(all_pairs_stem, '.xlsx'), asTable = TRUE)

11.12 Filtered

# Split the pairs by filter outcome.
rejected <- df[df$filter_criteria == 'reject', ]
kept <- df[df$filter_criteria != 'reject', ]


# Recompute the occurrence counts by reference (no merge needed): over all
# pairs in df and, separately, over the kept pairs only. `rejected` keeps the
# counts it had when it was split off above.
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


# 2 x 2 histogram panel, all pairs versus kept pairs, for both count columns.
# The y axis is shared so the panels are directly comparable.
count_panels <- list(
  "df$from_count" = df$from_count,
  "kept$from_count" = kept$from_count,
  "df$to_count" = df$to_count,
  "kept$to_count" = kept$to_count
)
max_count <- max(vapply(
  count_panels,
  function(values) max(hist(values, plot = FALSE, breaks = "Sturges")$counts),
  numeric(1)
))
xlim <- c(0, 100)

par(mfrow = c(2, 2))
for (panel in names(count_panels)) {
  hist(count_panels[[panel]],
       main = panel,
       xlab = sub("^.*\\$", "", panel),
       xlim = xlim, ylim = c(0, max_count))
}
par(mfrow = c(1, 1))
mtext("Before and after filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

# Same per-method breakdown as before the filter, restricted to the kept pairs.
long_kept <- data.table::rbindlist(lapply(methods, function(method) {
  supported <- kept[[method]] == TRUE
  data.table::data.table(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(supported & kept$MapMan4_Match == "no match"),
      sum(supported & stringr::str_detect(kept$MapMan4_Match, "100% match based")),
      sum(supported & stringr::str_detect(kept$MapMan4_Match, "partial match"))
    )
  )
}), use.names = TRUE)

# Stacking order of the bars.
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

after_plot1 <- ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_col() +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
print(after_plot1)

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
                plot = after_plot1,
                device = "pdf", width = 6, height = 6, units = "in")


# Evidence count versus match class for the kept pairs, with the "based on ..."
# detail cut off. The two matching classes keep a trailing space after the cut.
keptsub <- kept[, grep("count_evidence|MapMan4_Match", names(kept), value = TRUE), with = FALSE]
keptsub[, MapMan4_Match := sub('based on.*', '', MapMan4_Match)]
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          28423           2000            659
kept_evidence_by_match <- table(keptsub$count_evidence, keptsub$MapMan4_Match)
kept_evidence_by_match
##    
##     100% match  no match partial match 
##   1       10831     1009            504
##   2        9522      708             85
##   3        8070      283             70
tab <- as.data.table(as.data.frame(kept_evidence_by_match))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

# The level names carry the trailing space left by the sub() above.
tab[, MapMan4_Match := factor(as.character(MapMan4_Match),
                              levels = c('no match', 'partial match ', '100% match '))]

after_plot2 <- ggplot2::ggplot(tab, ggplot2::aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  ggplot2::geom_col() +
  ggplot2::labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
                x = "count_evidence",
                y = "Frequency",
                fill = "MapMan4_Match") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
print(after_plot2)

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
                plot = after_plot2,
                device = "pdf", width = 6, height = 6, units = "in")


# Frequency of kept pairs per filter criterion, faceted by evidence count and
# coloured by match class.
keptsub <- kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|ensembl-compara|count_evidence|MapMan4_Match|filter_criteria",
                                      names(kept), value = TRUE)]
# Cut " based on ..." including the leading space, so the class names have no
# trailing blank here.
keptsub$MapMan4_Match <- sub(' based on.*', '', keptsub$MapMan4_Match)
tab <- as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab <- tab[Freq > 0]
# Re-creating the factor drops evidence counts that no longer occur.
tab[, count_evidence := factor(count_evidence)]
# Fixed display order of the criteria; a criterion missing from this list becomes NA.
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "ensembl-compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


after_plot3 <- ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    # border around each facet; `linewidth` replaces the deprecated `size`
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
  )
print(after_plot3)

ggplot2::ggsave(paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
                plot = after_plot3,
                device = "pdf", width = 6, height = 6, units = "in")


# Export the rejected pairs.
rejected_path <- paste0('../reports/fruitTrees/', plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx')
openxlsx::write.xlsx(rejected, rejected_path, asTable = TRUE)


# Build an undirected graph from the distinct kept pairs and label every gene
# with the id of the connected component it belongs to.
edges <- unique(kept[, c("from_geneID", "to_geneID")])
g <- igraph::graph_from_data_frame(edges, directed = FALSE)
comp <- igraph::components(g)
membership_dt <- data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# The component is looked up through the source gene only; the disabled lines
# below are the variant for a directed graph, with a lookup on both ends.
kept <- merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


# Export the kept pairs together with their component id.
kept_path <- paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx')
openxlsx::write.xlsx(kept, kept_path, asTable = TRUE)


# Method columns shown in the UpSet plot, selected by flag1 (1, 2, 3; any
# other value takes the shortest list).  # make flags
source_cols_by_flag <- list(
  c("MCScanX", "ensembl-compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA"),
  c("MCScanX", "ensembl-compara", 'OrthoDB', 'RBH', "FastOMA"),
  c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
)
source_flag_hit <- which(c(1, 2, 3) == flag1)
source_cols <- if (length(source_flag_hit) > 0) {
  source_cols_by_flag[[source_flag_hit[1]]]
} else {
  c("MCScanX", 'RBH', "FastOMA")
}





# UpSet plot of the kept pairs: which combinations of methods support them.
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot <- ComplexUpset::upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = ComplexUpset::intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending"
) +
  ggplot2::ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

upset_path <- paste0("../reports/fruitTrees/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf")
ggplot2::ggsave(upset_path, plot = upset_plot, width = 24, height = 6, device = "pdf")



# Size of the kept table after filtering: distinct genes on each side, range
# of the per-gene pair counts, and number of genes occurring in more than 30 pairs.
cat("####  ####  after filter ####  ####  \n")
## ####  ####  after filter ####  ####
data.table::uniqueN(kept$from_geneID)
## [1] 18858
data.table::uniqueN(kept$to_geneID)
## [1] 19107
range(kept$from_count)
## [1]  1 46
range(kept$to_count)
## [1]  1 96
data.table::uniqueN(kept$from_geneID[kept$from_count > 30])
## [1] 7
data.table::uniqueN(kept$to_geneID[kept$to_count > 30])
## [1] 16
cat("####  ####  ####  ####  \n")
## ####  ####  ####  ####

11.13 PSS kept/rejected

# Reduce the PSS table to its id / pathway / short-name columns. The selection
# goes through a plain data.frame with drop = FALSE so it always returns a
# table, whether pss_long is a data.frame or a data.table and however many
# columns match.
pss_cols <- grep("id$|all_pathways$|short_name$", colnames(pss_long), value = TRUE)
pss_long <- as.data.frame(pss_long)[, pss_cols, drop = FALSE]
pss_long <- pss_long[!duplicated(pss_long), , drop = FALSE]

# Attach the orthologue, annotation and filter outcome of every pair whose
# source gene is the PSS id. The columns are taken from df itself.
pair_cols <- grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
                  names(df), value = TRUE)
pss_long <- merge(pss_long,
                  df[, .SD, .SDcols = pair_cols],
                  by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
# Keep only AT-prefixed ids and drop duplicated rows.
pss_long <- pss_long[grep('^AT', pss_long$id), ]
pss_long <- pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##             170             179             674             129             836
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/fruitTrees/', plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
# Step 1: params_list
# params_list <- list(
# ...
# )
# 
# Step 2: YAML header in 09_fruitTrees-child.Rmd
# ---
# title: "fruitTrees Child"
# output: html_document
# params:
#   plantName1: NULL
#   plantName2: NULL
#   plantName3: NULL
#   plantName4: NULL
#   plantDirIn: NULL
#   plantNameOut: NULL
#   plantDirOut: NULL
#   pattern_in: NULL
#   pattern_out: NULL
#   compara_pattern_in1: NULL
#   compara_pattern_in2: NULL
#   plaza_pattern_in1: NULL
#   plaza_pattern_in2: NULL
#   ref_genome: NULL
#   mercator: NULL
#   mercatorPatternIn1: NULL
#   mercatorPatternOut1: NULL
#   mercatorPatternIn2: NULL
#   mercatorPatternOut2: NULL
# ---
# 
# 
# access params in the script like:
# params$plantName1
# params$plantDirOut
# 
# Step 3: Call render() like
# rmarkdown::render(
#   input = "09_fruitTrees-child.Rmd",
#   params = params_list,
#   envir = new.env(parent = globalenv()),  # optional: isolate execution
#   output_format = "html_document",
#   quiet = FALSE
# )
# 
# 
# This will:
# Inject params_list into params$...
# Knit the child Rmd in the current R session, evaluated in the environment passed as envir
# Print progress to the console (quiet = FALSE)
# Save an .html file to the working directory

12 SessionInfo

# Print the R version, platform, locale and attached/loaded package versions of this run.
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8 
## [2] LC_CTYPE=English_United Kingdom.utf8   
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.utf8    
## 
## time zone: Europe/Ljubljana
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_3.5.2      knitr_1.50         data.table_1.17.0 
## [5] magrittr_2.0.3    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     crayon_1.5.3       dplyr_1.1.4       
##  [5] compiler_4.4.1     zip_2.3.2          Rcpp_1.0.14        tidyselect_1.2.1  
##  [9] stringr_1.5.1      dichromat_2.0-0.1  jquerylib_0.1.4    textshaping_1.0.1 
## [13] systemfonts_1.2.3  scales_1.4.0       yaml_2.3.10        fastmap_1.2.0     
## [17] R6_2.6.1           labeling_0.4.3     patchwork_1.3.0    generics_0.1.4    
## [21] igraph_2.1.4       openxlsx_4.2.8     tibble_3.2.1       bslib_0.9.0       
## [25] pillar_1.10.2      RColorBrewer_1.1-3 rlang_1.1.5        utf8_1.2.5        
## [29] cachem_1.1.0       stringi_1.8.7      xfun_0.52          sass_0.4.10       
## [33] cli_3.6.3          withr_3.0.2        digest_0.6.37      grid_4.4.1        
## [37] rstudioapi_0.17.1  lifecycle_1.0.4    vctrs_0.6.5        evaluate_1.0.3    
## [41] glue_1.8.0         farver_2.1.2       ragg_1.4.0         colorspace_2.1-1  
## [45] rmarkdown_2.29     tools_4.4.1        pkgconfig_2.0.3    htmltools_0.5.8.1